In [3]:
import os
import tensorflow as tf
import itertools
import numpy as np
from numpy import unravel_index
from random import choice, random, sample
from operator import itemgetter
np.random.seed(1337)  # for reproducibility
from keras.models import Sequential, Model, load_model
from keras.layers import Input, BatchNormalization, Reshape
from keras.layers import Dense, Dropout, Activation, Flatten, LocallyConnected2D
from keras.layers import Conv2D, MaxPooling2D, AlphaDropout, ConvLSTM2D, AvgPool2D, Conv2DTranspose, UpSampling2D
from keras.layers import add, concatenate, multiply
from keras.initializers import VarianceScaling, RandomUniform
from keras.optimizers import Adam, SGD, rmsprop
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import np_utils, multi_gpu_model
from keras.utils.vis_utils import plot_model
from keras.engine.topology import Container
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.regularizers import l1, l2
import keras.backend as K
K.set_image_dim_ordering('th')

from keras.callbacks import Callback, ReduceLROnPlateau, LearningRateScheduler, TensorBoard, ModelCheckpoint
import matplotlib.pyplot as plt
import time
%matplotlib notebook

Using TensorFlow backend.


In [4]:
class Ataxx:
    def __init__(self, board=None):
        if board is None:                  # if there is no initialization given
            self.data = np.zeros((7, 7))   # then generate a board with starting init, and black(-1) takes first turn
            self.data[0, 0] = -1           
            self.data[6, 6] = -1
            self.data[0, 6] = 1
            self.data[6, 0] = 1
        else:
            self.data = board
            
    def get_feature_map(self, turn):
        friendly_map = (self.data == turn).astype(float)
        opponent_map = (self.data == -turn).astype(float)
        return np.concatenate((friendly_map.reshape((1, 7, 7)), opponent_map.reshape((1, 7, 7))), axis=0)
    
    def get_mask(self, turn, init=0.001):   # zero(init) out all illegal move positions
        moves = self.get_moves(turn)
        moves = list(zip(*self.get_moves(turn)))
        if len(moves) == 0:
            return np.zeros((2, 7, 7))
        else:
            start_pos = tuple(zip(*moves[0]))
            end_pos = tuple(zip(*moves[1]))
            start_mask = np.ones((7, 7)) * init
            start_mask[start_pos] = 1
            end_mask = np.ones((7, 7)) * init
            end_mask[end_pos] = 1
            return np.concatenate((start_mask.reshape((1, 7, 7)), end_mask.reshape((1, 7, 7))), axis=0)
    
    def plot(self, is_next_move=False, turn=None):                        # plot the board
        image = self.data.copy()
        if is_next_move:
            if turn not in [-1, 1]:
                raise ValueError("Turn must be -1 or 1, or Must input a turn for next moves")
            else:
                next_moves = self.get_moves(turn)
                if len(next_moves) == 0:
                    raise ValueError("Game is over already")
                next_pos = list(zip(*next_moves))[1]
                for pos in next_pos:
                    image[pos] = turn / 2
        plt.imshow(image, cmap='gray')
        plt.xticks(range(7), range(7))
        plt.yticks(range(7), range(7))
        plt.show()
                
    def is_valid(self, turn, pos):
        if turn not in [-1, 1]:
            raise ValueError("Turn must be -1 or 1") 
        elif self.data[pos] != 0:
            return False
        else:
            for dr in range(-2, 3):
                for dc in range(-2, 3):
                    pos_tmp = (pos[0]+dr, pos[1]+dc)
                    if pos_tmp[0] >= 0 and pos_tmp[1] >= 0 and pos_tmp[0] < 7 and pos_tmp[1] < 7:
                        if self.data[pos_tmp] == turn:  # convert any piece of the opponent to 'turn'
                            return True
            return False
        
    def get_moves(self, turn):
        if turn not in [-1, 1]:
            raise ValueError("Turn must be -1 or 1")
        else:
            next_moves = []
            for r in range(7):
                for c in range(7):
                    has_duplicate_move = False      # move within the radius of one of another friendly piece is called
                    if self.is_valid(turn, (r, c)): # duplicate move
                        for dr in range(-2, 3):
                            for dc in range(-2, 3):
                                if abs(dr) <= 1 and abs(dc) <=1 and has_duplicate_move: 
                                    continue        # no need to record same move again
                                else:
                                    pos_tmp = (r+dr, c+dc)
                                    if pos_tmp[0] >= 0 and pos_tmp[1] >= 0 and pos_tmp[0] < 7 and pos_tmp[1] < 7:
                                        if self.data[pos_tmp] == turn:
                                            next_moves.append((pos_tmp, (r, c)))
            return next_moves
    
    def get_greedy_move(self, turn):
        moves = self.get_moves(turn)
        if len(moves) == 0:
            raise ValueError('No Possible Moves')
        store_data = self.data.copy()
        best_score = -1000000
        for move in moves:
            self.data = store_data.copy()
            self.move_to(turn, move[0], move[1])
            score = self.evaluate(turn, -turn)
            if score > best_score:
                best_move = move
                best_score = score
        self.data = store_data
        return best_move
        
    def move_to(self, turn, pos0, pos1):
        if turn not in [-1, 1]:
            raise ValueError("Turn must be -1 or 1") 
        elif not self.is_valid(turn, pos1):
            raise ValueError("This move: " + str((pos0, pos1)) + " of turn: " + str(turn) + " is invalid") 
        elif self.data[pos0] != turn:
            raise ValueError("The starting position is not your piece")
        else:
            dis = np.array(pos1) - np.array(pos0)    # check if is jump move or duplicate move
            if abs(dis[0]) > 1 or abs(dis[1]) > 1:   # jump move
                self.data[pos0] = 0
                self.data[pos1] = turn
            else:                                    # duplicate move
                self.data[pos1] = turn
            for dr in range(-1, 2):                  # infection mode!!!!
                for dc in range(-1, 2):
                    pos_tmp = (pos1[0]+dr, pos1[1]+dc)
                    if pos_tmp[0] >= 0 and pos_tmp[1] >= 0 and pos_tmp[0] < 7 and pos_tmp[1] < 7:
                        if self.data[pos_tmp] == -turn:  # convert any piece of the opponent to 'turn'
                            self.data[pos_tmp] = turn
                            
    def simu_move_to(self, turn, pos0, pos1):
        if turn not in [-1, 1]:
            raise ValueError("Turn must be -1 or 1") 
        elif not self.is_valid(turn, pos1):
            raise ValueError("This move: " + str((pos0, pos1)) + " of turn: " + str(turn) + " is invalid") 
        elif self.data[pos0] != turn:
            raise ValueError("You are not moving your piece")
        else:
            tmp_data = self.data.copy()
            dis = np.array(pos1) - np.array(pos0)    # check if is jump move or duplicate move
            if abs(dis[0]) > 1 or abs(dis[1]) > 1:   # jump move
                tmp_data[pos0] = 0
                tmp_data[pos1] = turn
            else:                                    # duplicate move
                tmp_data[pos1] = turn
            for dr in range(-1, 2):                  # infection mode!!!!
                for dc in range(-1, 2):
                    pos_tmp = (pos1[0]+dr, pos1[1]+dc)
                    if pos_tmp[0] >= 0 and pos_tmp[1] >= 0 and pos_tmp[0] < 7 and pos_tmp[1] < 7:
                        if tmp_data[pos_tmp] == -turn:  # convert any piece of the opponent to 'turn'
                            tmp_data[pos_tmp] = turn
            return tmp_data
    
    def evaluate(self, turn, this_turn, max_score=100, min_score=0.1):
        if turn not in [-1, 1]:
            raise ValueError("Turn must be -1 or 1") 
        else:
            turn_no = 0
            op_no = 0
            for r in range(7):
                for c in range(7):
                    if self.data[r, c] == turn:
                        turn_no += 1
                    elif self.data[r, c] == -turn:
                        op_no += 1
            if turn_no + op_no == 49:
                if turn_no > op_no:
                    return max_score
                else:
                    return -max_score
            else:
                if len(self.get_moves(this_turn)) == 0:# if one of them can no longer move, count and end
                    if turn_no > op_no:
                        return max_score
                    else:
                        return -max_score
                else:
                    value = turn_no - op_no
            return value * min_score

In [5]:
class Relay():
    def __init__(self, max_size):
        self.relay = []
        self.max_size = max_size

    def reset(self):
        self.relay = []
        
    def add_sample(self, new_sample):
        if len(self.relay) >= self.max_size:
            self.relay.pop(0)
        self.relay.append(new_sample)
    
    def get_batch(self, batch_size):
        if len(self.relay) < batch_size:
            print('not enough sample')
            raise ValueError('relay not large enough')
        else:
            batch = sample(self.relay, batch_size)
            # generate random array to decide the orientation of the data augmentation
            is_flip = np.random.random(batch_size) < 0.5
            rot_times = (np.random.random(batch_size) * 4).astype(int)
            
            cur_state = np.array([self.augmentation(item[0], is_flip[i], rot_times[i]) 
                                  for i, item in enumerate(batch)]).reshape(-1, 2, 7, 7)
            cur_mask = np.array([self.augmentation(item[1], is_flip[i], rot_times[i])
                                 for i, item in enumerate(batch)]).reshape(-1, 2, 7, 7)
            cur_action = np.array([self.augmentation(item[2], is_flip[i], rot_times[i])
                                   for i, item in enumerate(batch)]).reshape(-1, 2, 7, 7)
            try:
                cur_reward = np.array([item[3] for item in batch]).reshape(-1)
                new_state = np.array([self.augmentation(item[4], is_flip[i], rot_times[i])
                                      for i, item in enumerate(batch)]).reshape(-1, 2, 7, 7)
                new_mask = np.array([self.augmentation(item[5], is_flip[i], rot_times[i])
                                      for i, item in enumerate(batch)]).reshape(-1, 2, 7, 7)
            except:
                cur_reward = None
                new_state = None
                new_mask = None
            return cur_state, cur_mask, cur_action, cur_reward, new_state, new_mask
    
    def augmentation(self, data, is_flip, rot_time): # data must have three dimentions and depth first
        if data is None:
            return None
        else:
            if is_flip:
                data = np.fliplr(data)
            data = np.rot90(data, k=rot_time, axes=(1, 2))
            return data
    
    def size(self):
        return len(self.relay)

# BatchNormalization helps in hard-to-tune lr situations

In [15]:
def neg_log_loss(y_true, y_pred):
    y_true = K.reshape(y_true, (-1, 1, 98))
    y_pred = K.reshape(y_pred, (-1, 1, 98))
    y_log = K.log(y_pred)
    y_concate = K.concatenate([y_true, y_log], axis=1)
    y_prod = K.prod(y_concate, axis=1)
    return -K.sum(y_prod)


class CriticNetwork(object):
    def __init__(self, TAU=0.001, LEARNING_RATE=0):
        self.TAU = TAU
        self.LEARNING_RATE = LEARNING_RATE

        #Now create the model
        with tf.device("/cpu:0"):
            self.model = self.create_critic_network() 
            self.target_model = self.create_critic_network() 
            self.target_model.set_weights(self.model.get_weights())
        
        try:
            self.target_model.load_weights("pretrained_critic_model.h5")
            self.pretrained = True
        except:
            self.pretrained = False
        
    def target_train(self):
        critic_weights = self.model.get_weights()
        critic_target_weights = self.target_model.get_weights()
        for i in range(len(critic_weights)):
            critic_target_weights[i] = self.TAU * critic_weights[i] + (1 - self.TAU)* critic_target_weights[i]
        self.target_model.set_weights(critic_target_weights)
        
    def save_target_pretrain(self):
        self.target_model.save_weights("pretrained_critic_model.h5")

    def create_critic_network(self):
        board_input = Input(shape=(2, 7, 7))
        x = Conv2D(32, (1, 1), activation='relu', padding='same')(board_input)
        x = BatchNormalization(axis=1)(x)
        board_feature = MaxPooling2D((2, 2))(x)

        action_input = Input(shape=(2, 7, 7))
        x = Conv2D(32, (1, 1), activation='relu', padding='same')(action_input)
        x = BatchNormalization(axis=1)(x)
        action_feature = MaxPooling2D((2, 2))(x)
        
        overall_feature = concatenate([board_input, action_input], axis=1)
        x = Conv2D(64, (3, 3), activation='relu', padding='same')(overall_feature)
        x = BatchNormalization(axis=1)(x)
        x = MaxPooling2D((2, 2))(x)
        
        x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
        x = BatchNormalization(axis=1)(x)
        x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
        x = BatchNormalization(axis=1)(x)
        x = Flatten()(x)
        
        x = Dense(256, activation='relu')(x)
        x = BatchNormalization(axis=1)(x)
        x = Dense(1)(x)
        # construct model
        model = Model(inputs=[board_input, action_input], outputs=[x])
        print("actor")
        print(model.summary())
        # sgd = SGD(lr=self.LEARNING_RATE, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='mse', optimizer=Adam(lr=self.LEARNING_RATE, decay=1e-6), metrics=['mse'])
        return model
    
class ActorNetwork(object):
    def __init__(self, TAU=0.001, LEARNING_RATE=0, pretrain_epoch=2, is_train=True, \
                 file_name='target_actor_model.h5'):
        if is_train:
            self.TAU = TAU
            self.LEARNING_RATE = LEARNING_RATE
            
            #Now create the model
            with tf.device("/cpu:0"):
                self.model= self.create_actor_network()   
                self.target_model= self.create_actor_network() 
            if not pretrain_epoch == 0:
                try:
                    self.model.load_weights('pretrained_actor_model.h5')
                    print("succesfully loaded the pretrained model")
                except:
                    print("start to pre train with greedy")
                    self.learn_rules(batch_size=512, epochs=pretrain_epoch)
                    self.model.save_weights('pretrained_actor_model.h5')
            self.update_target()
        else:
            try:
                self.target_model = load_model(file_name)
                print("successfully loaded trained target model")
            except:
                raise ValueError('Can not load target actor model')
    
    def save(self):
        self.target_model.save('target_actor_model.h5')
        print("target_model successfully saved")

    def update_target(self):
        actor_weights = self.model.get_weights()
        self.target_model.set_weights(actor_weights)

    def create_actor_network(self):
        board_input = Input(shape=(2, 7, 7))
        mask_input = Input(shape=(2, 7, 7))
        x = Conv2D(64, (3, 3), activation='relu', padding='same')(board_input)
        x = BatchNormalization(axis=1)(x)
        x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
        x = BatchNormalization(axis=1)(x)
        x = MaxPooling2D((2, 2))(x)
        
        x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
        x = BatchNormalization(axis=1)(x)
        x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
        x = BatchNormalization(axis=1)(x)
        x = Flatten()(x)
        
        x = Dense(98, activation='softplus')(x)
        x = Reshape((2, 7, 7))(x) 
        x = multiply(inputs=[x, mask_input])     # this mask will mask any illegal move
        x = Reshape((2, 49))(x)
        x = Activation('softmax')(x)
        x = Reshape((2, 7, 7))(x) 
        model = Model(input=[board_input, mask_input],output=[x])
        print(model.summary())
        # sgd = SGD(lr=self.LEARNING_RATE, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=1.)
        model.compile(loss=neg_log_loss, optimizer=Adam(lr=self.LEARNING_RATE, decay=1e-6))
        return model
    
    def get_action_prob(self, feature_map, action_mask, is_target=False, get_batch=False):
        if not is_target:
            result = self.model.predict([np.reshape(feature_map, (-1, 2, 7, 7)), 
                                      np.reshape(action_mask, (-1, 2, 7, 7))])
        else:
            result = self.target_model.predict([np.reshape(feature_map, (-1, 2, 7, 7)), 
                                      np.reshape(action_mask, (-1, 2, 7, 7))])
        if get_batch:
            return result
        else:
            return result[0]
    
    def get_stochastic_action(self, feature_map, action_mask, is_target=False):
        # get action_prob
        action_prob = self.get_action_prob(feature_map, action_mask, is_target)
        # get valid_moves list
        game = Ataxx(board=feature_map[0] - feature_map[1])
        valid_moves = game.get_moves(1)
        index_list = range(len(valid_moves))
        # store pred prob for each valid moves and sort it
        valid_moves_prob = np.zeros(len(valid_moves))
        for i, move in enumerate(valid_moves):
            valid_moves_prob[i] = action_prob[0][move[0]] + action_prob[1][move[1]]
        # noramlize probablity
        valid_moves_prob = valid_moves_prob / valid_moves_prob.sum()
        # sample a policy according to predicted prob
        move_index = np.random.choice(range(len(valid_moves)), p=valid_moves_prob)
        move = valid_moves[move_index]
        # generate 7*7 pos map
        start_pos = np.zeros((7, 7))
        start_pos[move[0]] = 1
        end_pos = np.zeros((7, 7))
        end_pos[move[1]] = 1
        return move[0], move[1], np.concatenate((start_pos.reshape(1, 7, 7), end_pos.reshape(1, 7, 7)), axis=0)
    
    def get_batch_best_action(self, feature_map, action_mask, is_target=False):
        # calc batch size
        batch_size = feature_map.shape[0]
        # get action_prob
        action_prob = self.get_action_prob(feature_map, action_mask, is_target, get_batch=True)

        # get action map
        action_map = np.zeros(feature_map.shape)
        for k in range(batch_size):
            if feature_map[k].any():
                game = Ataxx(board=feature_map[k][0] - feature_map[k][1])
                valid_moves = game.get_moves(1)
                max_prob_sum = -1
                pos0 = (0, 0)
                pos1 = (0, 0)
                for move in valid_moves:
                    tmp_prob_sum = action_prob[k][0][move[0]] + action_prob[k][1][move[1]]
                    if tmp_prob_sum > max_prob_sum:
                        max_prob_sum = tmp_prob_sum
                        pos0 = move[0]
                        pos1 = move[1]
                action_map[k][0][pos0] = 1
                action_map[k][1][pos1] = 1
        return action_map
                
    
    def get_best_action(self, feature_map, action_mask, is_target=False, verbose=True):
        # get action_prob
        action_prob = self.get_action_prob(feature_map, action_mask, is_target)
        # get valid_moves list
        game = Ataxx(board=feature_map[0] - feature_map[1])
        valid_moves = game.get_moves(1)
        max_prob_sum = -1
        pos0 = (0, 0)
        pos1 = (0, 0)
        for move in valid_moves:
            tmp_prob_sum = action_prob[0][move[0]] + action_prob[1][move[1]]
            if tmp_prob_sum > max_prob_sum:
                max_prob_sum = tmp_prob_sum
                pos0 = move[0]
                pos1 = move[1]
        
        # output the max_prob
        if verbose:
            r = np.random.random()
            if r < 0.01:
                print("confidence of this best move is: ", max_prob_sum)
            
        # generate 7*7 pos map
        start_pos = np.zeros((7, 7))
        start_pos[pos0] = 1
        end_pos = np.zeros((7, 7))
        end_pos[pos1] = 1
        
        return pos0, pos1, np.concatenate((start_pos.reshape(1, 7, 7), end_pos.reshape(1, 7, 7)), axis=0)

    def data_generator(self, batch_size, max_size=10000):
        relay = Relay(max_size)
        game = Ataxx()
        turn = -1
        i = 0
        while True: 
            i += 1
            new_sample = []
            new_sample.append(game.get_feature_map(turn))
            new_sample.append(game.get_mask(turn))
            
            try:
                greedy_move = game.get_greedy_move(turn)
            except:
                game = Ataxx()
                turn = -1
                continue
            start_prob_map = np.zeros((7, 7))
            start_prob_map[greedy_move[0]] = 1
            end_prob_map = np.zeros((7, 7))
            end_prob_map[greedy_move[1]] = 1
                
            new_sample.append(np.concatenate((start_prob_map.reshape(1, 7, 7), end_prob_map.reshape(1, 7, 7)), axis=0))
            new_sample.extend([None, None, None])
            relay.add_sample(new_sample)
            
            move = choice(game.get_moves(turn))
            game.move_to(turn, move[0], move[1])
            turn = -turn
            
            if not i % 16 and relay.size() > batch_size*4:
                state, mask, action, _, _, _ = relay.get_batch(batch_size)
                yield [state, mask], action  
                
    def learn_rules(self, batch_size, epochs=10):
        tmp_game = Ataxx()
        print("trained action prob map predicted by initial model for a starting game")
        print(self.get_action_prob(tmp_game.get_feature_map(-1), tmp_game.get_mask(-1), is_target=True))
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        gpu_model = multi_gpu_model(self.model, gpus=4)
        gpu_model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['acc'])
        gpu_model.fit_generator(self.data_generator(batch_size), steps_per_epoch=256, epochs=epochs, 
                                    verbose=1,
                                    validation_data=self.data_generator(32), 
                                    validation_steps=1,
                                    workers=16,
                                    use_multiprocessing=True,
                                    callbacks=[ReduceLROnPlateau(cooldown=5, min_lr=0.01, patience=2)])
        
        # set target network and learning network to be the same
        self.update_target()
        
        tmp_game = Ataxx()
        print("trained action prob map predicted by initial model for a starting game")
        print(self.get_action_prob(tmp_game.get_feature_map(-1), tmp_game.get_mask(-1), is_target=True))

In [16]:
def reinforcement_learning(epochs=1000, check_point=True, step_per_epoch=50, \
                           BATCH_SIZE=64, TRAIN_BATCH_TIME=1, GAMMA=0.99, RELAY_SIZE=50000, TAU=0.001, LRA=0.0001, LRC=0.0001):        
    actor = ActorNetwork(TAU, LRA, 32)
    critic = CriticNetwork(TAU, LRC)
    relay = Relay(RELAY_SIZE)

    max_score = 1
    min_score = 0.001
    
    for ep in range(epochs):
        reward = 0
        step = 0

        game = Ataxx()
        turn = -1 # start turn is -1

        while abs(game.evaluate(turn, turn, max_score, min_score)) != max_score:
            new_sample = [] # prepare to collect new sample
            
            cur_feature = game.get_feature_map(turn)
            cur_mask = game.get_mask(turn)
            
            pos0, pos1, cur_action = actor.get_stochastic_action(cur_feature, cur_mask) # get proposed action
            
            # new_state = game.simu_move_to(turn, pos0, pos1)
            
            # create new_sample and add that to relay
            new_sample.append(cur_feature) # cur_state
            new_sample.append(cur_mask) # cur_mask
            new_sample.append(cur_action) # cur_action
            
            game.move_to(turn, pos0, pos1) # do the move here if the move is valid
            turn = -turn # the turn now is the next turn, what we are learning for is -turn
            
            new_sample.append(game.evaluate(-turn, turn, max_score, min_score)) # cur_reward
            if abs(new_sample[-1]) == max_score:
                new_sample.append(np.zeros((2, 7, 7))) 
                new_sample.append(np.zeros((2, 7, 7))) 
            else:
                new_sample.append(game.get_feature_map(turn)) # new_state
                new_sample.append(game.get_mask(turn)) # new_mask
                
            relay.add_sample(new_sample)
            step += 1
        
        print("Episode : " + str(ep) + " Replay Buffer " + str(relay.size()))
        print("TOTAL REWARD @ " + str(ep) +"-th Episode  : Reward " + str(game.evaluate(1, turn, max_score, min_score)))
        print("Total Step: " + str(step))
        
        #Do the batch update....if possible
        if relay.size() > BATCH_SIZE * 2:
            for t in range(step_per_epoch):
                cur_state, cur_mask, cur_action, \
                    cur_reward, new_state, new_mask = relay.get_batch(BATCH_SIZE)
                q_target = cur_reward
                critic_sample_weight = np.ones(BATCH_SIZE)
                
                # set bias training weight and store all new actions
                bias_sample_weight = 10
                new_action = np.zeros((BATCH_SIZE, 2, 7, 7))
                not_end_of_game = np.ones(BATCH_SIZE)
                
                # using batch action prediction leads to a 5 times improvement in speed
                print("start searching new action")
                start = time.time()
                # generate new action using target model
                new_action = actor.get_batch_best_action(new_state, new_mask, is_target=True)
                for k in range(BATCH_SIZE):
                    if not new_state[k].any():
                        critic_sample_weight[k] = bias_sample_weight
                        not_end_of_game[k] = 0
                print("get new action: ", time.time()-start)            
                  
                print("start getting new_q")
                start = time.time()
                q_new = critic.target_model.predict([new_state, new_action]).reshape(BATCH_SIZE)
                q_target += - GAMMA*not_end_of_game*q_new
                # log loss update weight equal to critic output    
                actor_sample_weight = q_target
                print("get new_q: ", time.time()-start)            
                 
                print("train start:")
                start = time.time()
                # train models
                for nb in range(TRAIN_BATCH_TIME):
                    print("critic loss", critic.model.train_on_batch([cur_state, cur_action], q_target,\
                                                                     sample_weight=critic_sample_weight))
                if not critic.pretrained:
                    if ep == 150:
                        critic.save_target_pretrain()
                        critic.pretrained = True
                else:
                    print("actor loss", actor.model.train_on_batch([cur_state, cur_mask], cur_action,\
                                                                    sample_weight=actor_sample_weight))
                # update target critic models slowly for stability
                critic.target_train()
                print("train end", time.time()-start)
                
                for k in range(BATCH_SIZE):
                    if not_end_of_game[k] == 0:
                        q_p = critic.model.predict([cur_state[k].reshape(1, 2, 7, 7), \
                                             cur_action[k].reshape(1, 2, 7, 7)])[0]
                        q_p_t = critic.target_model.predict([cur_state[k].reshape(1, 2, 7, 7), \
                                             cur_action[k].reshape(1, 2, 7, 7)])[0]
                        print("should be {}, predicted: {}, target predicted: {}".format(q_target[k], q_p, q_p_t))
        
                        
        # performance checking
        if (ep + 1) % 10 == 0:
            tmp_game = Ataxx()
            print("trained action prob map predicted by initial model for a starting game")
            print(actor.get_action_prob(tmp_game.get_feature_map(-1), tmp_game.get_mask(-1), is_target=False))
        
        # update and check point model
        if (ep+1) % 300 == 0:
            actor.update_target()
            if check_point:
                actor.save()

    print("Finish.")
    actor.save()
    return actor

## Ideas:
1. when comes to new state in training, we should make sure we use target network for both actor and crtic model, in order to have more stability
2. We might will need to keep relay_size smaller so that actor can always learn on a more recent version of target actor network
3. Normalizing that action probability so that the chosen action becomes 1 while other actions becomes 0 is reasonable, because firstly, it makes it easier for crtic model to get a stable output and maintain the property of discrete policys, at least a same critic output for the same choice of policy, secondly, it still helps the guiding of the training on policy model, cuz bad gradient still will be passed if the critic output is bad, and as long as we train for only a few times (maybe one is the most reasonable one..), the property of discrete policy maintains even if we do not change the action mask. 
4. The confidence of the target model of its prediction seems to be a good indication for convergence, cuz that's the only indicator that is monotonic during the training process。。。。
5. The natural phenomenom of training is that, in the begining, loss for actor is close to 0, as critic is just initialized and its output will always be that. For a while, constant increase will be observed in actor loss, as critic is starting to learn larger values while actor is still basically a random picker. After a while, when critic loss has decreased a lot, actor loss should also start to decrease, and this period should be the actual training for actor model.
6. During training, it can be helpful to set the sample weight for 1 and -1 to be very large, cuz it is (1)the hardest to learn considering the around zero init, and (2) most important way to understand rules
7. Setting learning rate for critic to be higher might be reasonable as it helps q converge faster


In [17]:
actor = reinforcement_learning(epochs=2000, check_point=True, step_per_epoch=8, 
                               RELAY_SIZE=50000, BATCH_SIZE=256, TRAIN_BATCH_TIME=5, LRC=5e-5, LRA=1e-5, TAU=0.01)



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           (None, 2, 7, 7)      0                                            
__________________________________________________________________________________________________
conv2d_37 (Conv2D)              (None, 64, 7, 7)     1216        input_17[0][0]                   
__________________________________________________________________________________________________
batch_normalization_41 (BatchNo (None, 64, 7, 7)     256         conv2d_37[0][0]                  
__________________________________________________________________________________________________
conv2d_38 (Conv2D)              (None, 64, 7, 7)     36928       batch_normalization_41[0][0]     
__________________________________________________________________________________________________
batch_norm

succesfully loaded the pretrained model
actor
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 2, 7, 7)      0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           (None, 2, 7, 7)      0                                            
__________________________________________________________________________________________________
concatenate_5 (Concatenate)     (None, 4, 7, 7)      0           input_21[0][0]                   
                                                                 input_22[0][0]                   
__________________________________________________________________________________________________
conv2d_47 (Conv2D)              (None, 64, 7, 7)     2368      

get new action:  0.2520325183868408
start getting new_q
get new_q:  0.11342000961303711
train start:
critic loss [1.3358223, 1.2515056]
critic loss [1.2063838, 1.160249]
critic loss [1.0532249, 1.0417418]
critic loss [0.91222358, 0.91154212]
critic loss [0.79160774, 0.78028774]
train end 0.5057463645935059
should be -1.0, predicted: [ 0.10907055], target predicted: [-0.12805255]
should be 1.0, predicted: [ 0.14058764], target predicted: [-0.02119489]
should be 1.0, predicted: [ 0.15637663], target predicted: [-0.02856207]
start searching new action
get new action:  0.2661299705505371
start getting new_q
get new_q:  0.06645941734313965
train start:
critic loss [1.2614779, 1.0255499]
critic loss [1.129199, 0.96823466]
critic loss [0.9662025, 0.89033115]
critic loss [0.81950957, 0.80136907]
critic loss [0.7093209, 0.70720041]
train end 0.46562671661376953
should be 1.0, predicted: [ 0.16327415], target predicted: [-0.07336703]
should be 1.0, predicted: [ 0.18231045], target predicted: [-0

get new action:  0.2458515167236328
start getting new_q
get new_q:  0.06366276741027832
train start:
critic loss [0.84237397, 0.57035613]
critic loss [0.74564892, 0.54679704]
critic loss [0.61577284, 0.51159662]
critic loss [0.50715673, 0.47406667]
critic loss [0.4428159, 0.43820542]
train end 0.40228748321533203
should be 1.0, predicted: [ 0.47366878], target predicted: [ 0.05413502]
should be 1.0, predicted: [ 0.52335656], target predicted: [ 0.05730252]
should be -1.0, predicted: [ 0.32819316], target predicted: [ 0.00787585]
start searching new action
get new action:  0.24584102630615234
start getting new_q
get new_q:  0.06182050704956055
train start:
critic loss [0.69057548, 0.59463328]
critic loss [0.65726012, 0.58100361]
critic loss [0.59559965, 0.54678607]
critic loss [0.52196765, 0.49901429]
critic loss [0.45123568, 0.44521347]
train end 0.392214298248291
should be 1.0, predicted: [ 0.44841358], target predicted: [ 0.03739609]
start searching new action
get new action:  0.2466

Episode : 9 Replay Buffer 1028
TOTAL REWARD @ 9-th Episode  : Reward -1
Total Step: 125
start searching new action
get new action:  0.25383830070495605
start getting new_q
get new_q:  0.06370043754577637
train start:
critic loss [0.63476861, 0.38342196]
critic loss [0.57230234, 0.37184519]
critic loss [0.48213094, 0.35263866]
critic loss [0.3926346, 0.32982859]
critic loss [0.32456559, 0.30645978]
train end 0.4376828670501709
should be 1.0, predicted: [ 0.67515218], target predicted: [ 0.1469283]
should be 1.0, predicted: [ 0.05739093], target predicted: [ 0.14692548]
should be -1.0, predicted: [ 0.07216386], target predicted: [ 0.14018121]
should be -1.0, predicted: [ 0.2488285], target predicted: [ 0.13525736]
start searching new action
get new action:  0.2406158447265625
start getting new_q
get new_q:  0.06232261657714844
train start:
critic loss [0.32110417, 0.32110417]
critic loss [0.31553435, 0.31553435]
critic loss [0.30393034, 0.30393034]
critic loss [0.28708982, 0.28708982]
cr

get new action:  0.23508954048156738
start getting new_q
get new_q:  0.07419347763061523
train start:
critic loss [0.25510055, 0.23990487]
critic loss [0.25335887, 0.24031416]
critic loss [0.24554363, 0.23668647]
critic loss [0.23336737, 0.22897242]
critic loss [0.21870972, 0.21757475]
train end 0.43395471572875977
should be -1.0, predicted: [-0.21977894], target predicted: [ 0.19900632]
should be 1.0, predicted: [ 0.52032453], target predicted: [ 0.26315817]
start searching new action
get new action:  0.22481274604797363
start getting new_q
get new_q:  0.06295371055603027
train start:
critic loss [0.37567782, 0.29523623]
critic loss [0.35488614, 0.2882179]
critic loss [0.31955081, 0.27463609]
critic loss [0.28009412, 0.25682509]
critic loss [0.24508274, 0.2370307]
train end 0.41522741317749023
should be -1.0, predicted: [-0.19885667], target predicted: [ 0.17145389]
should be 1.0, predicted: [ 0.01863388], target predicted: [ 0.24569532]
should be 1.0, predicted: [ 0.34745359], target

critic loss [0.22203642, 0.21746773]
train end 0.3985018730163574
should be 1.0, predicted: [-0.64238006], target predicted: [ 0.28000256]
should be 1.0, predicted: [-0.0780829], target predicted: [ 0.31223208]
should be 1.0, predicted: [-0.31402493], target predicted: [ 0.29270601]
start searching new action
get new action:  0.22908449172973633
start getting new_q
get new_q:  0.06716299057006836
train start:
critic loss [0.44699779, 0.25173718]
critic loss [0.40299582, 0.23863542]
critic loss [0.32833761, 0.21862864]
critic loss [0.2533673, 0.19886355]
critic loss [0.19960223, 0.18379019]
train end 0.39459919929504395
should be 1.0, predicted: [-0.37464353], target predicted: [ 0.26333416]
should be -1.0, predicted: [-0.23756757], target predicted: [ 0.35837761]
should be 1.0, predicted: [-0.41493776], target predicted: [ 0.29637402]
start searching new action
get new action:  0.2405869960784912
start getting new_q
get new_q:  0.06636929512023926
train start:
critic loss [0.28657615, 

get new action:  0.22734880447387695
start getting new_q
get new_q:  0.06375837326049805
train start:
critic loss [0.21009544, 0.18780491]
critic loss [0.19727352, 0.18091363]
critic loss [0.18007636, 0.17076597]
critic loss [0.16340931, 0.15957966]
critic loss [0.15008315, 0.14880311]
train end 0.4437859058380127
should be 1.0, predicted: [-0.76548022], target predicted: [ 0.25524718]
should be -1.0, predicted: [-1.10009515], target predicted: [ 0.17842132]
should be 1.0, predicted: [-0.56315625], target predicted: [ 0.29007897]
start searching new action
get new action:  0.24712491035461426
start getting new_q
get new_q:  0.0704948902130127
train start:
critic loss [0.29715204, 0.20047435]
critic loss [0.27188879, 0.19630148]
critic loss [0.23488842, 0.18991536]
critic loss [0.20041952, 0.18260673]
critic loss [0.1772978, 0.17472726]
train end 0.4138815402984619
should be -1.0, predicted: [-0.63709003], target predicted: [ 0.28798592]
should be 1.0, predicted: [-0.37378842], target p

critic loss [0.22936428, 0.17233653]
critic loss [0.20534, 0.16213065]
critic loss [0.17845885, 0.15092708]
critic loss [0.15453677, 0.14010186]
train end 0.44506263732910156
should be 1.0, predicted: [ 0.50435656], target predicted: [ 0.34087193]
should be 1.0, predicted: [ 0.31223285], target predicted: [ 0.29766172]
should be -1.0, predicted: [-1.18818593], target predicted: [ 0.21261868]
should be 1.0, predicted: [-0.13626963], target predicted: [ 0.32038218]
start searching new action
get new action:  0.23866653442382812
start getting new_q
get new_q:  0.0700840950012207
train start:
critic loss [0.28426176, 0.16774407]
critic loss [0.26957285, 0.16525394]
critic loss [0.23615432, 0.1585799]
critic loss [0.19600798, 0.14957173]
critic loss [0.16090104, 0.1405379]
train end 0.4472689628601074
should be 1.0, predicted: [-0.25290701], target predicted: [ 0.27509463]
should be 1.0, predicted: [-0.0424962], target predicted: [ 0.19468582]
should be 1.0, predicted: [ 0.20444974], target

critic loss [0.1974525, 0.15990672]
critic loss [0.18037081, 0.15300646]
critic loss [0.16107976, 0.14438397]
critic loss [0.14267585, 0.13476038]
train end 0.4187617301940918
should be 1.0, predicted: [ 0.2837823], target predicted: [ 0.33025396]
should be 1.0, predicted: [ 0.43213135], target predicted: [ 0.37681222]
should be 1.0, predicted: [ 0.43360159], target predicted: [ 0.35015997]
should be -1.0, predicted: [-0.68315774], target predicted: [ 0.28126827]
start searching new action
get new action:  0.23864412307739258
start getting new_q
get new_q:  0.06621456146240234
train start:
critic loss [0.34657854, 0.16084085]
critic loss [0.31425548, 0.1538474]
critic loss [0.26060563, 0.14376354]
critic loss [0.20304248, 0.13365638]
critic loss [0.15706217, 0.12587643]
train end 0.40529370307922363
should be 1.0, predicted: [ 0.26190192], target predicted: [ 0.22031437]
should be -1.0, predicted: [-0.71165061], target predicted: [ 0.24219266]
should be 1.0, predicted: [ 0.22279322], t

get new action:  0.2443406581878662
start getting new_q
get new_q:  0.07221245765686035
train start:
critic loss [0.10694425, 0.10694425]
critic loss [0.1040668, 0.1040668]
critic loss [0.10154632, 0.10154632]
critic loss [0.099191122, 0.099191122]
critic loss [0.096605897, 0.096605897]
train end 0.39891529083251953
trained action prob map predicted by initial model for a starting game
[[[  4.74112213e-01   2.26921802e-05   2.26965112e-05   2.27041946e-05
     2.27350065e-05   2.27338332e-05   2.27408400e-05]
  [  2.26640877e-05   2.26761294e-05   2.26807369e-05   2.26782286e-05
     2.26922693e-05   2.26753946e-05   2.26919001e-05]
  [  2.26959255e-05   2.26813208e-05   2.26909469e-05   2.27196178e-05
     2.26813208e-05   2.27249893e-05   2.27216315e-05]
  [  2.26802385e-05   2.26970296e-05   2.26941283e-05   2.27356340e-05
     2.26867924e-05   2.26919656e-05   2.26652755e-05]
  [  2.27070323e-05   2.26959455e-05   2.27087421e-05   2.27080709e-05
     2.26818611e-05   2.26802385e-05

critic loss [0.13600549, 0.092894346]
train end 0.4229452610015869
should be 1.0, predicted: [ 0.66290337], target predicted: [ 0.25684208]
should be 1.0, predicted: [ 0.71862799], target predicted: [ 0.39026147]
should be 1.0, predicted: [ 0.28831196], target predicted: [ 0.2631743]
should be 1.0, predicted: [ 1.43050611], target predicted: [ 0.34584707]
should be 1.0, predicted: [ 0.41804957], target predicted: [ 0.2431723]
should be 1.0, predicted: [ 0.63533062], target predicted: [ 0.33512646]
start searching new action
get new action:  0.24251604080200195
start getting new_q
get new_q:  0.07123923301696777
train start:
critic loss [0.22857898, 0.11280265]
critic loss [0.21085396, 0.10882445]
critic loss [0.18127649, 0.10294541]
critic loss [0.14792734, 0.096325919]
critic loss [0.11753249, 0.089960262]
train end 0.41710782051086426
should be -1.0, predicted: [-0.69679874], target predicted: [ 0.31360453]
should be 1.0, predicted: [ 0.74237639], target predicted: [ 0.12153815]
shou

get new action:  0.23079705238342285
start getting new_q
get new_q:  0.06240534782409668
train start:
critic loss [0.12273575, 0.12273575]
critic loss [0.12113547, 0.12113547]
critic loss [0.1179618, 0.1179618]
critic loss [0.1134174, 0.1134174]
critic loss [0.10778493, 0.10778493]
train end 0.3970503807067871
start searching new action
get new action:  0.23416829109191895
start getting new_q
get new_q:  0.07180356979370117
train start:
critic loss [0.14706931, 0.10140217]
critic loss [0.1402586, 0.099394456]
critic loss [0.12806672, 0.095929742]
critic loss [0.11344588, 0.091539755]
critic loss [0.099211991, 0.086724445]
train end 0.40755271911621094
should be 1.0, predicted: [ 1.0493623], target predicted: [ 0.08068068]
should be 1.0, predicted: [ 0.979307], target predicted: [ 0.28392828]
should be 1.0, predicted: [ 0.91030025], target predicted: [ 0.39251739]
should be 1.0, predicted: [ 0.71931088], target predicted: [ 0.1625703]
should be 1.0, predicted: [ 0.60057706], target pred

Episode : 25 Replay Buffer 2455
TOTAL REWARD @ 25-th Episode  : Reward -1
Total Step: 120
start searching new action
get new action:  0.25498223304748535
start getting new_q
get new_q:  0.06566309928894043
train start:
critic loss [0.1094752, 0.1094752]
critic loss [0.1068559, 0.1068559]
critic loss [0.1043653, 0.1043653]
critic loss [0.1017033, 0.1017033]
critic loss [0.098457769, 0.098457769]
train end 0.4097151756286621
start searching new action
get new action:  0.23501801490783691
start getting new_q
get new_q:  0.06729936599731445
train start:
critic loss [0.089084566, 0.083106592]
critic loss [0.086854577, 0.081632547]
critic loss [0.082476236, 0.078783691]
critic loss [0.076814331, 0.07484147]
critic loss [0.070866853, 0.070226513]
train end 0.43855929374694824
should be 1.0, predicted: [ 1.21219873], target predicted: [ 0.12832882]
should be 1.0, predicted: [ 1.14374375], target predicted: [-0.05640017]
start searching new action
get new action:  0.23548316955566406
start gett

get new action:  0.23478412628173828
start getting new_q
get new_q:  0.06241130828857422
train start:
critic loss [0.14920962, 0.11422283]
critic loss [0.13866141, 0.10894561]
critic loss [0.12451065, 0.10195625]
critic loss [0.10935955, 0.094118193]
critic loss [0.095221668, 0.086108029]
train end 0.43479061126708984
should be 1.0, predicted: [ 0.73842275], target predicted: [ 0.19153814]
should be 1.0, predicted: [ 0.6347487], target predicted: [ 0.17201222]
should be 1.0, predicted: [ 0.7562356], target predicted: [ 0.28103819]
Episode : 27 Replay Buffer 2575
TOTAL REWARD @ 27-th Episode  : Reward -1
Total Step: 9
start searching new action
get new action:  0.2400827407836914
start getting new_q
get new_q:  0.06899333000183105
train start:
critic loss [0.12087666, 0.093498223]
critic loss [0.1111605, 0.089339539]
critic loss [0.099259883, 0.084908411]
critic loss [0.088213541, 0.08091896]
critic loss [0.080136478, 0.077757031]
train end 0.48526453971862793
should be 1.0, predicted: 

get new action:  0.22711491584777832
start getting new_q
get new_q:  0.07129621505737305
train start:
critic loss [0.19995669, 0.10395338]
critic loss [0.18229926, 0.099611096]
critic loss [0.15266363, 0.093169093]
critic loss [0.12079859, 0.086376309]
critic loss [0.095060587, 0.080601871]
train end 0.39715147018432617
should be 1.0, predicted: [ 0.72448474], target predicted: [ 0.2055314]
should be -1.0, predicted: [-0.96673167], target predicted: [ 0.19955668]
should be 1.0, predicted: [ 0.7768783], target predicted: [ 0.21533699]
should be 1.0, predicted: [ 0.61817312], target predicted: [ 0.05342173]
start searching new action
get new action:  0.2502012252807617
start getting new_q
get new_q:  0.07814645767211914
train start:
critic loss [0.14355828, 0.089824267]
critic loss [0.13888806, 0.089309722]
critic loss [0.12888612, 0.08788234]
critic loss [0.11560344, 0.085465059]
critic loss [0.1014308, 0.082180537]
train end 0.5919740200042725
should be 1.0, predicted: [ 0.31026936], t

critic loss [0.086556509, 0.069422185]
train end 0.4618644714355469
should be 1.0, predicted: [ 0.9205687], target predicted: [ 0.2220379]
should be 1.0, predicted: [ 0.84016961], target predicted: [-0.0099994]
should be 1.0, predicted: [ 0.66153234], target predicted: [ 0.12505056]
should be 1.0, predicted: [ 0.56077999], target predicted: [ 0.27147964]
should be 1.0, predicted: [ 0.8795585], target predicted: [ 0.22713484]
start searching new action
get new action:  0.23989510536193848
start getting new_q
get new_q:  0.06947016716003418
train start:
critic loss [0.15335691, 0.088343412]
critic loss [0.1455847, 0.088161968]
critic loss [0.1302613, 0.085875347]
critic loss [0.11197176, 0.082044624]
critic loss [0.094068125, 0.077337071]
train end 0.5500826835632324
should be 1.0, predicted: [ 0.5153538], target predicted: [ 0.15719983]
should be 1.0, predicted: [ 0.90581959], target predicted: [ 0.10078806]
should be 1.0, predicted: [ 0.87749457], target predicted: [-0.02511686]
start 

get new action:  0.24361491203308105
start getting new_q
get new_q:  0.06868767738342285
train start:
critic loss [0.10324088, 0.083181575]
critic loss [0.10055577, 0.082745291]
critic loss [0.094684109, 0.08087007]
critic loss [0.087193504, 0.077952817]
critic loss [0.079563975, 0.074382097]
train end 0.3923606872558594
should be -1.0, predicted: [-0.92283088], target predicted: [ 0.02967283]
should be 1.0, predicted: [ 1.09685576], target predicted: [ 0.30174837]
should be 1.0, predicted: [ 1.21936035], target predicted: [ 0.21221568]
should be 1.0, predicted: [ 0.94377565], target predicted: [ 0.09248955]
start searching new action
get new action:  0.2482755184173584
start getting new_q
get new_q:  0.07690930366516113
train start:
critic loss [0.10647064, 0.068924263]
critic loss [0.10242511, 0.068235338]
critic loss [0.095307097, 0.066866741]
critic loss [0.086487293, 0.065026343]
critic loss [0.077360466, 0.062930055]
train end 0.48315906524658203
should be 1.0, predicted: [ 0.607

get new action:  0.2414228916168213
start getting new_q
get new_q:  0.0625162124633789
train start:
critic loss [0.077212766, 0.077212766]
critic loss [0.075731069, 0.075731069]
critic loss [0.073310345, 0.073310345]
critic loss [0.070157997, 0.070157997]
critic loss [0.066496648, 0.066496648]
train end 0.42995452880859375
Episode : 34 Replay Buffer 3415
TOTAL REWARD @ 34-th Episode  : Reward 1
Total Step: 156
start searching new action
get new action:  0.25264692306518555
start getting new_q
get new_q:  0.07012534141540527
train start:
critic loss [0.072195806, 0.064549677]
critic loss [0.070752837, 0.063443668]
critic loss [0.067978144, 0.061658859]
critic loss [0.064223453, 0.059297279]
critic loss [0.059884969, 0.056478184]
train end 0.434375524520874
should be -1.0, predicted: [-0.74999022], target predicted: [ 0.00873802]
start searching new action
get new action:  0.24012541770935059
start getting new_q
get new_q:  0.0621190071105957
train start:
critic loss [0.2048924, 0.091662

get new action:  0.2230212688446045
start getting new_q
get new_q:  0.07096076011657715
train start:
critic loss [0.11181092, 0.069699332]
critic loss [0.10638456, 0.070431367]
critic loss [0.096959591, 0.069587126]
critic loss [0.085829422, 0.06738615]
critic loss [0.074932471, 0.064177752]
train end 0.40862107276916504
should be 1.0, predicted: [ 1.04569137], target predicted: [-0.31589431]
should be 1.0, predicted: [ 0.72314614], target predicted: [ 0.20287606]
should be 1.0, predicted: [ 0.93964851], target predicted: [ 0.173163]
should be 1.0, predicted: [ 1.11106312], target predicted: [ 0.0017726]
should be 1.0, predicted: [ 1.09309494], target predicted: [ 0.11963461]
should be 1.0, predicted: [ 0.72998732], target predicted: [-0.09123459]
Episode : 36 Replay Buffer 3570
TOTAL REWARD @ 36-th Episode  : Reward 1
Total Step: 135
start searching new action
get new action:  0.2490687370300293
start getting new_q
get new_q:  0.06894922256469727
train start:
critic loss [0.12640525, 

critic loss [0.10219561, 0.071657583]
critic loss [0.092760444, 0.069619067]
critic loss [0.082558945, 0.067328811]
critic loss [0.073564813, 0.065125085]
train end 0.43634033203125
should be 1.0, predicted: [ 1.07168007], target predicted: [ 0.06255002]
should be 1.0, predicted: [ 0.50269866], target predicted: [ 0.07325504]
start searching new action
get new action:  0.23551249504089355
start getting new_q
get new_q:  0.06288456916809082
train start:
critic loss [0.1248494, 0.079699486]
critic loss [0.11976459, 0.078814059]
critic loss [0.10719147, 0.074507616]
critic loss [0.091046095, 0.068401344]
critic loss [0.075684458, 0.06234144]
train end 0.40012335777282715
should be 1.0, predicted: [ 0.96123326], target predicted: [ 0.04066785]
should be 1.0, predicted: [ 0.66408718], target predicted: [-0.10100141]
start searching new action
get new action:  0.22506189346313477
start getting new_q
get new_q:  0.06213665008544922
train start:
critic loss [0.14538644, 0.063186772]
critic los

critic loss [0.073062763, 0.056987077]
critic loss [0.066970602, 0.054643299]
critic loss [0.059651818, 0.051698565]
critic loss [0.052827016, 0.048765913]
train end 0.413790225982666
should be 1.0, predicted: [ 1.07181644], target predicted: [-0.28737271]
should be 1.0, predicted: [ 0.81131279], target predicted: [ 0.09651011]
should be 1.0, predicted: [ 1.12006652], target predicted: [ 0.1440029]
should be 1.0, predicted: [ 0.98514402], target predicted: [-0.2032596]
start searching new action
get new action:  0.24188685417175293
start getting new_q
get new_q:  0.06509709358215332
train start:
critic loss [0.063083135, 0.051473819]
critic loss [0.062712401, 0.0517537]
critic loss [0.060511526, 0.0514118]
critic loss [0.056903969, 0.050336756]
critic loss [0.052563623, 0.048589688]
train end 0.4044053554534912
should be 1.0, predicted: [ 0.95972949], target predicted: [-0.27171442]
should be 1.0, predicted: [ 0.76939654], target predicted: [ 0.0004105]
start searching new action
get n

get new action:  0.23041224479675293
start getting new_q
get new_q:  0.06614923477172852
train start:
critic loss [0.085131958, 0.070653476]
critic loss [0.086824581, 0.073503204]
critic loss [0.085163161, 0.073517025]
critic loss [0.08027298, 0.070609681]
critic loss [0.072880335, 0.06530413]
train end 0.39788246154785156
should be 1.0, predicted: [ 0.97020006], target predicted: [-0.08236643]
should be 1.0, predicted: [ 0.62463176], target predicted: [ 0.11032981]
should be 1.0, predicted: [ 1.20696676], target predicted: [ 0.02979708]
Episode : 41 Replay Buffer 3731
TOTAL REWARD @ 41-th Episode  : Reward 1
Total Step: 124
start searching new action
get new action:  0.26218247413635254
start getting new_q
get new_q:  0.07510805130004883
train start:
critic loss [0.074524358, 0.070809416]
critic loss [0.069619581, 0.065981247]
critic loss [0.064294934, 0.060949236]
critic loss [0.059289627, 0.056416146]
critic loss [0.055089444, 0.052803636]
train end 0.5169477462768555
should be 1.0,

critic loss [0.12955374, 0.059602231]
critic loss [0.11571908, 0.058162183]
critic loss [0.10006772, 0.056707472]
critic loss [0.084917724, 0.055246636]
train end 0.4089162349700928
should be -1.0, predicted: [-0.53354871], target predicted: [-0.39726847]
should be -1.0, predicted: [-0.79732728], target predicted: [-0.28415567]
should be 1.0, predicted: [ 0.95339334], target predicted: [ 0.05351947]
should be 1.0, predicted: [ 0.81940889], target predicted: [-0.31352538]
should be 1.0, predicted: [ 1.12480187], target predicted: [ 0.08777861]
should be 1.0, predicted: [ 0.46177876], target predicted: [-0.18873093]
start searching new action
get new action:  0.25044703483581543
start getting new_q
get new_q:  0.08792424201965332
train start:
critic loss [0.088411316, 0.065009162]
critic loss [0.084309995, 0.064563841]
critic loss [0.077912517, 0.063409388]
critic loss [0.070801362, 0.061733682]
critic loss [0.064191252, 0.059663631]
train end 0.4184684753417969
should be 1.0, predicted:

get new action:  0.2420058250427246
start getting new_q
get new_q:  0.07201385498046875
train start:
critic loss [0.10000652, 0.076022163]
critic loss [0.094727822, 0.073617868]
critic loss [0.087038122, 0.069943614]
critic loss [0.078107178, 0.065467745]
critic loss [0.069052108, 0.060646452]
train end 0.5760223865509033
should be -1.0, predicted: [-0.72396785], target predicted: [-0.4527595]
should be 1.0, predicted: [ 1.08555901], target predicted: [-0.28815243]
should be 1.0, predicted: [ 0.85882527], target predicted: [-0.04313205]
should be 1.0, predicted: [ 0.93588871], target predicted: [ 0.05912279]
start searching new action
get new action:  0.23170065879821777
start getting new_q
get new_q:  0.061273813247680664
train start:
critic loss [0.20479622, 0.073215045]
critic loss [0.18689044, 0.07169231]
critic loss [0.16218464, 0.06973438]
critic loss [0.13602474, 0.068005472]
critic loss [0.1126186, 0.06690044]
train end 0.5589230060577393
should be 1.0, predicted: [ 0.88855481]

get new action:  0.23479318618774414
start getting new_q
get new_q:  0.0637056827545166
train start:
critic loss [0.23438261, 0.086027302]
critic loss [0.21741463, 0.083422683]
critic loss [0.18867412, 0.079920813]
critic loss [0.15527099, 0.076340742]
critic loss [0.12352736, 0.073297173]
train end 0.44980549812316895
should be -1.0, predicted: [-0.49057618], target predicted: [-0.24052615]
should be -1.0, predicted: [-0.61404026], target predicted: [-0.24493234]
should be -1.0, predicted: [-0.70301783], target predicted: [-0.20938958]
start searching new action
get new action:  0.24968266487121582
start getting new_q
get new_q:  0.06851983070373535
train start:
critic loss [0.097253188, 0.089452937]
critic loss [0.096421584, 0.088503852]
critic loss [0.091488622, 0.084446147]
critic loss [0.084072232, 0.078562289]
critic loss [0.076034173, 0.072284177]
train end 0.41129016876220703
should be 1.0, predicted: [ 0.74335754], target predicted: [ 0.10323823]
start searching new action
get

critic loss [0.13056591, 0.064459167]
critic loss [0.11352079, 0.061713401]
critic loss [0.094293915, 0.058586612]
critic loss [0.076607123, 0.055629097]
train end 0.4650411605834961
should be -1.0, predicted: [-0.76591551], target predicted: [-0.27284881]
should be 1.0, predicted: [ 0.65655196], target predicted: [-0.36374474]
should be 1.0, predicted: [ 0.99433672], target predicted: [ 0.12971129]
should be 1.0, predicted: [ 0.70948601], target predicted: [ 0.18761367]
should be -1.0, predicted: [-0.84602833], target predicted: [ 0.04222648]
should be 1.0, predicted: [ 0.86589575], target predicted: [-0.47835416]
start searching new action
get new action:  0.2340097427368164
start getting new_q
get new_q:  0.06447935104370117
train start:
critic loss [0.082608163, 0.057120901]
critic loss [0.079368196, 0.057407878]
critic loss [0.073712103, 0.057203274]
critic loss [0.067300566, 0.056705233]
critic loss [0.061687563, 0.056047868]
train end 0.4287395477294922
should be 1.0, predicted:

Episode : 50 Replay Buffer 4874
TOTAL REWARD @ 50-th Episode  : Reward -1
Total Step: 114
start searching new action
get new action:  0.23965144157409668
start getting new_q
get new_q:  0.06741046905517578
train start:
critic loss [0.09845902, 0.059230223]
critic loss [0.094845608, 0.060053155]
critic loss [0.087859288, 0.059736256]
critic loss [0.078867927, 0.058384895]
critic loss [0.069501005, 0.056348659]
train end 0.3878672122955322
should be 1.0, predicted: [ 1.06258249], target predicted: [-0.22377974]
should be 1.0, predicted: [ 0.976098], target predicted: [-0.02291447]
should be 1.0, predicted: [ 0.98334265], target predicted: [-0.09885813]
should be -1.0, predicted: [-0.4393118], target predicted: [-0.12099843]
start searching new action
get new action:  0.24149584770202637
start getting new_q
get new_q:  0.06999444961547852
train start:
critic loss [0.1035394, 0.059775397]
critic loss [0.10099329, 0.058847286]
critic loss [0.092792824, 0.056833141]
critic loss [0.081092224,

get new action:  0.23499774932861328
start getting new_q
get new_q:  0.06604337692260742
train start:
critic loss [0.16526334, 0.072719187]
critic loss [0.15415508, 0.071215011]
critic loss [0.13504644, 0.068917044]
critic loss [0.11325313, 0.066680253]
critic loss [0.093405083, 0.065187007]
train end 0.41240835189819336
should be -1.0, predicted: [-0.99908102], target predicted: [-0.34624827]
should be 1.0, predicted: [ 0.65354496], target predicted: [ 0.10065363]
should be -1.0, predicted: [-0.54494554], target predicted: [ 0.188587]
should be 1.0, predicted: [ 0.62872171], target predicted: [ 0.17763163]
Episode : 52 Replay Buffer 5041
TOTAL REWARD @ 52-th Episode  : Reward 1
Total Step: 155
start searching new action
get new action:  0.2508676052093506
start getting new_q
get new_q:  0.06763195991516113
train start:
critic loss [0.058418065, 0.058418065]
critic loss [0.059205417, 0.059205417]
critic loss [0.05906608, 0.05906608]
critic loss [0.057883624, 0.057883624]
critic loss [0

critic loss [0.052446134, 0.051397923]
train end 0.7173998355865479
should be 1.0, predicted: [ 0.95475549], target predicted: [ 0.07145306]
should be 1.0, predicted: [ 1.01224065], target predicted: [ 0.27321357]
start searching new action
get new action:  0.29646801948547363
start getting new_q
get new_q:  0.08272314071655273
train start:
critic loss [0.083482891, 0.062877834]
critic loss [0.079558603, 0.060966045]
critic loss [0.073178813, 0.057926469]
critic loss [0.065550603, 0.054206051]
critic loss [0.057795465, 0.05024197]
train end 0.5297236442565918
should be 1.0, predicted: [ 0.88782561], target predicted: [ 0.14753729]
should be 1.0, predicted: [ 0.90575802], target predicted: [ 0.08414987]
should be 1.0, predicted: [ 0.74923229], target predicted: [-0.25872013]
should be 1.0, predicted: [ 0.8723737], target predicted: [ 0.17973697]
start searching new action
get new action:  0.24874138832092285
start getting new_q
get new_q:  0.070709228515625
train start:
critic loss [0.0

get new action:  0.2296748161315918
start getting new_q
get new_q:  0.06300687789916992
train start:
critic loss [0.07464987, 0.056599088]
critic loss [0.071887188, 0.055237353]
critic loss [0.067442, 0.053411521]
critic loss [0.062121958, 0.051363479]
critic loss [0.056733835, 0.049310181]
train end 0.4159066677093506
should be 1.0, predicted: [ 0.87528843], target predicted: [ 0.06915059]
should be -1.0, predicted: [-0.76007861], target predicted: [-0.20110111]
start searching new action
get new action:  0.22467803955078125
start getting new_q
get new_q:  0.06775403022766113
train start:
critic loss [0.1105244, 0.064771488]
critic loss [0.10602254, 0.063561223]
critic loss [0.096753322, 0.061580509]
critic loss [0.084997505, 0.059197173]
critic loss [0.073278017, 0.056821853]
train end 0.4109935760498047
should be 1.0, predicted: [ 0.61891311], target predicted: [-0.02808217]
should be 1.0, predicted: [ 0.63370496], target predicted: [-0.31625482]
start searching new action
get new a

critic loss [0.1270397, 0.059704773]
critic loss [0.11225419, 0.057554152]
critic loss [0.0946256, 0.054921895]
critic loss [0.077363431, 0.052204877]
train end 0.4434680938720703
should be 1.0, predicted: [ 0.86927044], target predicted: [ 0.34789589]
should be -1.0, predicted: [-0.3648721], target predicted: [-0.12451643]
should be 1.0, predicted: [ 0.7029388], target predicted: [-0.1201261]
start searching new action
get new action:  0.2457575798034668
start getting new_q
get new_q:  0.06641173362731934
train start:
critic loss [0.10989143, 0.058597729]
critic loss [0.10415354, 0.05765605]
critic loss [0.092791855, 0.05562105]
critic loss [0.079133555, 0.053159803]
critic loss [0.06632226, 0.050930031]
train end 0.41616225242614746
should be 1.0, predicted: [ 0.62225175], target predicted: [ 0.05018965]
should be 1.0, predicted: [ 0.67809266], target predicted: [ 0.33033049]
should be 1.0, predicted: [ 0.97996289], target predicted: [ 0.21961942]
should be 1.0, predicted: [ 0.683027

get new action:  0.25402331352233887
start getting new_q
get new_q:  0.07609963417053223
train start:
critic loss [0.064672962, 0.057985086]
critic loss [0.062462516, 0.057465397]
critic loss [0.059439674, 0.056048602]
critic loss [0.05585577, 0.053768203]
critic loss [0.051987283, 0.050795577]
train end 0.5570826530456543
should be 1.0, predicted: [ 0.84925711], target predicted: [ 0.14101559]
should be 1.0, predicted: [ 1.05344045], target predicted: [-0.09238594]
should be 1.0, predicted: [ 0.83415842], target predicted: [ 0.25407323]
start searching new action
get new action:  0.23549199104309082
start getting new_q
get new_q:  0.07663464546203613
train start:
critic loss [0.13001609, 0.063679084]
critic loss [0.12307055, 0.061347835]
critic loss [0.11159457, 0.058462914]
critic loss [0.097860828, 0.055462301]
critic loss [0.083882354, 0.052657001]
train end 0.46154212951660156
should be 1.0, predicted: [ 0.73111272], target predicted: [ 0.4241758]
should be 1.0, predicted: [ 0.608

get new action:  0.2330777645111084
start getting new_q
get new_q:  0.06828784942626953
train start:
critic loss [0.093278587, 0.062482886]
critic loss [0.091140613, 0.064445995]
critic loss [0.08730495, 0.066280484]
critic loss [0.082524493, 0.067543224]
critic loss [0.077362791, 0.067881227]
train end 0.4448375701904297
should be 1.0, predicted: [ 0.5608058], target predicted: [-0.3650941]
start searching new action
get new action:  0.22788286209106445
start getting new_q
get new_q:  0.06632232666015625
train start:
critic loss [0.082287401, 0.071636736]
critic loss [0.080708638, 0.072029628]
critic loss [0.077283815, 0.070872836]
critic loss [0.072758764, 0.06849657]
critic loss [0.067828305, 0.065281704]
train end 0.39193010330200195
should be 1.0, predicted: [ 0.79829109], target predicted: [ 0.02702785]
should be 1.0, predicted: [ 0.87214577], target predicted: [ 0.0175119]
should be 1.0, predicted: [ 0.72836047], target predicted: [ 0.22977298]
start searching new action
get new

critic loss [0.073394142, 0.073394142]
critic loss [0.071447983, 0.071447983]
critic loss [0.068799108, 0.068799108]
critic loss [0.06557788, 0.06557788]
train end 0.48092150688171387
start searching new action
get new action:  0.27234554290771484
start getting new_q
get new_q:  0.0688021183013916
train start:
critic loss [0.097479731, 0.068112388]
critic loss [0.09398593, 0.066455126]
critic loss [0.088261798, 0.06434755]
critic loss [0.081185244, 0.061954364]
critic loss [0.073937148, 0.059454985]
train end 0.40465354919433594
should be 1.0, predicted: [ 0.50955975], target predicted: [-0.25484571]
start searching new action
get new action:  0.22365808486938477
start getting new_q
get new_q:  0.06644487380981445
train start:
critic loss [0.090616398, 0.068212233]
critic loss [0.084696688, 0.065783873]
critic loss [0.075513914, 0.061872248]
critic loss [0.06574861, 0.057553995]
critic loss [0.057720795, 0.053796694]
train end 0.4160771369934082
should be 1.0, predicted: [ 0.81520414],

get new action:  0.2617819309234619
start getting new_q
get new_q:  0.07629013061523438
train start:
critic loss [0.051578458, 0.051578458]
critic loss [0.053644322, 0.053644322]
critic loss [0.055192348, 0.055192348]
critic loss [0.055756386, 0.055756386]
critic loss [0.055169709, 0.055169709]
train end 0.6591503620147705
start searching new action
get new action:  0.29305291175842285
start getting new_q
get new_q:  0.13931703567504883
train start:
critic loss [0.068847358, 0.065717205]
critic loss [0.066666588, 0.064050518]
critic loss [0.063069969, 0.061197158]
critic loss [0.058886647, 0.057779208]
critic loss [0.054813143, 0.054320533]
train end 0.7148730754852295
should be 1.0, predicted: [ 0.9145425], target predicted: [ 0.24806023]
start searching new action
get new action:  0.24155092239379883
start getting new_q
get new_q:  0.05969119071960449
train start:
critic loss [0.134286, 0.070026167]
critic loss [0.12709902, 0.069746681]
critic loss [0.11720937, 0.06905023]
critic los

get new action:  0.23383688926696777
start getting new_q
get new_q:  0.06337356567382812
train start:
critic loss [0.12305236, 0.068179987]
critic loss [0.12284306, 0.070779607]
critic loss [0.11404739, 0.070254333]
critic loss [0.098890357, 0.066693045]
critic loss [0.081149794, 0.06116759]
train end 0.4042999744415283
should be 1.0, predicted: [ 0.86149752], target predicted: [ 0.21332958]
should be 1.0, predicted: [ 0.79924607], target predicted: [ 0.37158045]
should be -1.0, predicted: [-0.56476152], target predicted: [-0.17230669]
should be 1.0, predicted: [ 0.83869135], target predicted: [ 0.14971468]
should be 1.0, predicted: [ 0.84426963], target predicted: [ 0.21736197]
start searching new action
get new action:  0.23721003532409668
start getting new_q
get new_q:  0.0673527717590332
train start:
critic loss [0.091792941, 0.058124639]
critic loss [0.089514807, 0.056680687]
critic loss [0.084967583, 0.055355169]
critic loss [0.078933924, 0.054171525]
critic loss [0.072167963, 0.

start searching new action
get new action:  0.2551455497741699
start getting new_q
get new_q:  0.06651806831359863
train start:
critic loss [0.070836157, 0.070836157]
critic loss [0.068095699, 0.068095699]
critic loss [0.064272977, 0.064272977]
critic loss [0.059952337, 0.059952337]
critic loss [0.055632882, 0.055632882]
train end 0.4718968868255615
start searching new action
get new action:  0.2387073040008545
start getting new_q
get new_q:  0.06843256950378418
train start:
critic loss [0.15889922, 0.059670012]
critic loss [0.1398655, 0.057434827]
critic loss [0.11680886, 0.055700503]
critic loss [0.095626906, 0.055191904]
critic loss [0.079842344, 0.055826753]
train end 0.42880868911743164
should be 1.0, predicted: [ 0.49417284], target predicted: [ 0.20892383]
should be -1.0, predicted: [-1.4729141], target predicted: [-0.44400197]
should be -1.0, predicted: [-0.82333791], target predicted: [-0.1348272]
should be 1.0, predicted: [ 0.57508218], target predicted: [ 0.22733951]
should 

critic loss [0.073798858, 0.058686964]
critic loss [0.068156287, 0.05542345]
critic loss [0.061814927, 0.052011319]
critic loss [0.055627964, 0.048768684]
train end 0.4104194641113281
should be 1.0, predicted: [ 0.54906958], target predicted: [ 0.05922532]
should be 1.0, predicted: [ 0.84169996], target predicted: [ 0.17626616]
start searching new action
get new action:  0.24096965789794922
start getting new_q
get new_q:  0.06978988647460938
train start:
critic loss [0.053218074, 0.051642306]
critic loss [0.051811893, 0.050379053]
critic loss [0.050086051, 0.048891053]
critic loss [0.048127949, 0.04721339]
critic loss [0.046010561, 0.045369111]
train end 0.421771764755249
should be 1.0, predicted: [ 0.72508901], target predicted: [ 0.59935737]
trained action prob map predicted by initial model for a starting game
[[[  4.74112213e-01   2.26921802e-05   2.26965112e-05   2.27041946e-05
     2.27350065e-05   2.27338332e-05   2.27408400e-05]
  [  2.26640877e-05   2.26761294e-05   2.26807369

critic loss [0.071352541, 0.063728258]
train end 0.4040369987487793
should be 1.0, predicted: [ 0.90161312], target predicted: [ 0.39280182]
should be 1.0, predicted: [ 0.57771301], target predicted: [ 0.18830207]
start searching new action
get new action:  0.2404167652130127
start getting new_q
get new_q:  0.060343265533447266
train start:
critic loss [0.079543233, 0.063057475]
critic loss [0.076464787, 0.060365066]
critic loss [0.072574027, 0.057900354]
critic loss [0.068126239, 0.055664349]
critic loss [0.063417733, 0.053607218]
train end 0.40976405143737793
should be -1.0, predicted: [-0.56744325], target predicted: [-0.45185956]
start searching new action
get new action:  0.2258591651916504
start getting new_q
get new_q:  0.0668942928314209
train start:
critic loss [0.053784121, 0.053763974]
critic loss [0.052803703, 0.052799843]
critic loss [0.051548034, 0.05154632]
critic loss [0.050012458, 0.049987424]
critic loss [0.048231643, 0.04815428]
train end 0.3949761390686035
should be

get new action:  0.23777270317077637
start getting new_q
get new_q:  0.07136774063110352
train start:
critic loss [0.099864312, 0.083998337]
critic loss [0.097427048, 0.082207426]
critic loss [0.092089862, 0.078956157]
critic loss [0.084590323, 0.07452555]
critic loss [0.076112643, 0.069401294]
train end 0.4142317771911621
should be 1.0, predicted: [ 0.91396463], target predicted: [ 0.3746044]
should be 1.0, predicted: [ 0.6129564], target predicted: [ 0.42512721]
start searching new action
get new action:  0.23221087455749512
start getting new_q
get new_q:  0.06227684020996094
train start:
critic loss [0.10073809, 0.064028718]
critic loss [0.09628097, 0.061990961]
critic loss [0.088687167, 0.059645388]
critic loss [0.079395287, 0.057204556]
critic loss [0.069894604, 0.054854602]
train end 0.41213226318359375
should be 1.0, predicted: [ 0.82424986], target predicted: [ 0.30161092]
should be 1.0, predicted: [ 1.10349166], target predicted: [ 0.41178018]
should be 1.0, predicted: [ 0.642

critic loss [0.081184268, 0.05991473]
critic loss [0.073878892, 0.058250885]
critic loss [0.066765174, 0.056504924]
critic loss [0.060393706, 0.054561436]
train end 0.5458199977874756
should be -1.0, predicted: [-0.87183994], target predicted: [-0.235778]
Episode : 75 Replay Buffer 7118
TOTAL REWARD @ 75-th Episode  : Reward -1
Total Step: 17
start searching new action
get new action:  0.24472975730895996
start getting new_q
get new_q:  0.06890726089477539
train start:
critic loss [0.066945225, 0.052348316]
critic loss [0.06706208, 0.052446898]
critic loss [0.0656773, 0.051993906]
critic loss [0.062753916, 0.05090297]
critic loss [0.05861821, 0.049231209]
train end 0.45364856719970703
should be 1.0, predicted: [ 1.02712727], target predicted: [ 0.45563459]
should be 1.0, predicted: [ 0.73851782], target predicted: [ 0.54073936]
should be 1.0, predicted: [ 0.87008929], target predicted: [ 0.41798639]
should be -1.0, predicted: [-0.86620718], target predicted: [-0.47836959]
should be 1.0

critic loss [0.08122918, 0.043137133]
critic loss [0.072950423, 0.041710854]
critic loss [0.063631728, 0.040175553]
critic loss [0.054585766, 0.038697064]
train end 0.49213266372680664
should be 1.0, predicted: [ 0.76761907], target predicted: [ 0.480674]
should be -1.0, predicted: [-0.70852], target predicted: [-0.57968205]
should be 1.0, predicted: [ 0.47674426], target predicted: [-0.07410337]
start searching new action
get new action:  0.23892450332641602
start getting new_q
get new_q:  0.06312155723571777
train start:
critic loss [0.083330721, 0.059629187]
critic loss [0.081594318, 0.059780899]
critic loss [0.0780572, 0.059122391]
critic loss [0.073359795, 0.057771876]
critic loss [0.067921937, 0.055867951]
train end 0.43159914016723633
should be 1.0, predicted: [ 1.00017369], target predicted: [ 0.41325194]
should be 1.0, predicted: [ 0.62986624], target predicted: [ 0.21551457]
should be 1.0, predicted: [ 0.7931127], target predicted: [ 0.30446178]
should be -1.0, predicted: [-0

get new action:  0.24385857582092285
start getting new_q
get new_q:  0.06462454795837402
train start:
critic loss [0.073996231, 0.051278956]
critic loss [0.071557879, 0.050060973]
critic loss [0.067097694, 0.048368484]
critic loss [0.06138337, 0.046338823]
critic loss [0.055203922, 0.044125408]
train end 0.4241511821746826
should be 1.0, predicted: [ 0.65416092], target predicted: [ 0.4805969]
should be 1.0, predicted: [ 0.80111253], target predicted: [ 0.52694207]
should be 1.0, predicted: [ 0.69601375], target predicted: [ 0.23168869]
start searching new action
get new action:  0.23361563682556152
start getting new_q
get new_q:  0.06385970115661621
train start:
critic loss [0.16905685, 0.05950461]
critic loss [0.15885556, 0.057784539]
critic loss [0.14150196, 0.054629169]
critic loss [0.12071188, 0.050945826]
critic loss [0.09997201, 0.047576658]
train end 0.39796924591064453
should be -1.0, predicted: [-0.55300039], target predicted: [-0.14750968]
should be 1.0, predicted: [ 1.14248

get new action:  0.2486732006072998
start getting new_q
get new_q:  0.06952214241027832
train start:
critic loss [0.087132409, 0.060734265]
critic loss [0.084082693, 0.06003882]
critic loss [0.077043116, 0.057239272]
critic loss [0.067941189, 0.053194068]
critic loss [0.058799855, 0.048866302]
train end 0.40265989303588867
should be 1.0, predicted: [ 0.50467563], target predicted: [ 0.19403318]
should be -1.0, predicted: [-0.74931711], target predicted: [-0.52421808]
should be 1.0, predicted: [ 0.81091326], target predicted: [ 0.21647923]
start searching new action
get new action:  0.23627257347106934
start getting new_q
get new_q:  0.06387758255004883
train start:
critic loss [0.048235536, 0.048048262]
critic loss [0.046931356, 0.046879649]
critic loss [0.046135828, 0.046063833]
critic loss [0.045654602, 0.045453951]
critic loss [0.045137696, 0.044781629]
train end 0.39905643463134766
should be 1.0, predicted: [ 0.91905296], target predicted: [ 0.51581478]
should be 1.0, predicted: [ 

get new action:  0.2458035945892334
start getting new_q
get new_q:  0.0733487606048584
train start:
critic loss [0.066600755, 0.047553107]
critic loss [0.064669967, 0.047625348]
critic loss [0.061012413, 0.047036853]
critic loss [0.056240581, 0.045844365]
critic loss [0.051112525, 0.044211421]
train end 0.6008381843566895
should be 1.0, predicted: [ 0.80022889], target predicted: [ 0.43218076]
should be 1.0, predicted: [ 0.81346452], target predicted: [ 0.44848165]
start searching new action
get new action:  0.24181699752807617
start getting new_q
get new_q:  0.06612181663513184
train start:
critic loss [0.046607792, 0.044144787]
critic loss [0.045698009, 0.043634802]
critic loss [0.043991812, 0.042419128]
critic loss [0.041841924, 0.040728442]
critic loss [0.039563119, 0.038790748]
train end 0.47978830337524414
should be 1.0, predicted: [ 0.81098312], target predicted: [ 0.36908677]
should be 1.0, predicted: [ 1.13145936], target predicted: [ 0.67439562]
start searching new action
get

get new action:  0.23923134803771973
start getting new_q
get new_q:  0.06681537628173828
train start:
critic loss [0.067691773, 0.05579859]
critic loss [0.066524133, 0.055465307]
critic loss [0.064052552, 0.05446431]
critic loss [0.060657892, 0.052880928]
critic loss [0.056697093, 0.050812211]
train end 0.43685483932495117
should be 1.0, predicted: [ 0.70162714], target predicted: [ 0.04883994]
start searching new action
get new action:  0.24045109748840332
start getting new_q
get new_q:  0.06477975845336914
train start:
critic loss [0.12368596, 0.05603594]
critic loss [0.11575811, 0.053673297]
critic loss [0.10202226, 0.050140686]
critic loss [0.085601047, 0.04621727]
critic loss [0.069662213, 0.042634811]
train end 0.5781173706054688
should be 1.0, predicted: [ 0.65892041], target predicted: [ 0.38386545]
should be 1.0, predicted: [ 0.54552889], target predicted: [ 0.03874252]
should be -1.0, predicted: [-0.56011719], target predicted: [-0.40369123]
start searching new action
get new

get new action:  0.25823283195495605
start getting new_q
get new_q:  0.07078361511230469
train start:
critic loss [0.056397751, 0.055586755]
critic loss [0.056378271, 0.055503782]
critic loss [0.055348486, 0.054554682]
critic loss [0.053345419, 0.052742727]
critic loss [0.050591923, 0.050223269]
train end 0.5738205909729004
should be 1.0, predicted: [ 0.78944755], target predicted: [ 0.47699088]
Episode : 86 Replay Buffer 8038
TOTAL REWARD @ 86-th Episode  : Reward 1
Total Step: 155
start searching new action
get new action:  0.2593238353729248
start getting new_q
get new_q:  0.07161402702331543
train start:
critic loss [0.1105182, 0.06220375]
critic loss [0.10326965, 0.060311697]
critic loss [0.092435956, 0.057792693]
critic loss [0.081042394, 0.055329029]
critic loss [0.070181862, 0.053226747]
train end 0.4969062805175781
should be 1.0, predicted: [ 0.48210996], target predicted: [-0.03994724]
should be -1.0, predicted: [-0.67868483], target predicted: [-0.56764799]
start searching n

critic loss [0.058777664, 0.047090225]
critic loss [0.05675108, 0.047065966]
critic loss [0.053804655, 0.046557292]
critic loss [0.050469693, 0.045631647]
train end 0.49695730209350586
should be 1.0, predicted: [ 0.71321768], target predicted: [ 0.2591154]
should be 1.0, predicted: [ 0.98681843], target predicted: [ 0.62305295]
start searching new action
get new action:  0.2661924362182617
start getting new_q
get new_q:  0.08006739616394043
train start:
critic loss [0.059189282, 0.057112299]
critic loss [0.057776757, 0.056073349]
critic loss [0.055672538, 0.054445453]
critic loss [0.05319757, 0.052441511]
critic loss [0.050631799, 0.050258622]
train end 0.5394277572631836
should be -1.0, predicted: [-0.95844233], target predicted: [-0.44823274]
start searching new action
get new action:  0.23918771743774414
start getting new_q
get new_q:  0.0672905445098877
train start:
critic loss [0.13964733, 0.056500636]
critic loss [0.13018, 0.053722724]
critic loss [0.11345841, 0.049611878]
critic

critic loss [0.082356125, 0.058557346]
train end 0.4114670753479004
should be 1.0, predicted: [ 0.63225025], target predicted: [ 0.41573587]
should be 1.0, predicted: [ 0.99707776], target predicted: [ 0.54803228]
should be 1.0, predicted: [ 0.66458106], target predicted: [ 0.20456828]
should be 1.0, predicted: [ 0.95693713], target predicted: [ 0.43033931]
start searching new action
get new action:  0.260300874710083
start getting new_q
get new_q:  0.07282614707946777
train start:
critic loss [0.091133371, 0.063204661]
critic loss [0.094660968, 0.06562867]
critic loss [0.091399565, 0.064325131]
critic loss [0.082482524, 0.059860203]
critic loss [0.07056082, 0.053693768]
train end 0.42197346687316895
should be -1.0, predicted: [-0.59013224], target predicted: [-0.53742933]
should be 1.0, predicted: [ 0.68116766], target predicted: [ 0.38963604]
should be 1.0, predicted: [ 0.83336914], target predicted: [ 0.40610757]
should be 1.0, predicted: [ 0.7729035], target predicted: [ 0.61814988

get new action:  0.2547619342803955
start getting new_q
get new_q:  0.07408380508422852
train start:
critic loss [0.15443611, 0.060639665]
critic loss [0.1460395, 0.059277661]
critic loss [0.12952612, 0.056542188]
critic loss [0.10925517, 0.053334765]
critic loss [0.088979758, 0.050495483]
train end 0.48644161224365234
should be 1.0, predicted: [ 0.69221693], target predicted: [ 0.47874007]
should be -1.0, predicted: [-0.67947847], target predicted: [-0.41267857]
should be 1.0, predicted: [ 0.42745093], target predicted: [ 0.04005326]
should be 1.0, predicted: [ 0.69795942], target predicted: [ 0.26206523]
should be 1.0, predicted: [ 0.73505187], target predicted: [ 0.30623686]
start searching new action
get new action:  0.23600077629089355
start getting new_q
get new_q:  0.07294940948486328
train start:
critic loss [0.051388059, 0.047683723]
critic loss [0.052797727, 0.049060337]
critic loss [0.053336538, 0.049932107]
critic loss [0.052870397, 0.05006564]
critic loss [0.051451284, 0.0

get new action:  0.2308640480041504
start getting new_q
get new_q:  0.07137274742126465
train start:
critic loss [0.13243023, 0.067444973]
critic loss [0.12618411, 0.065645114]
critic loss [0.11454901, 0.063155308]
critic loss [0.099984489, 0.060261708]
critic loss [0.085280932, 0.057246231]
train end 0.508634090423584
should be 1.0, predicted: [ 0.78579652], target predicted: [ 0.37013102]
should be -1.0, predicted: [-0.7400974], target predicted: [-0.18594396]
should be 1.0, predicted: [ 0.93869036], target predicted: [ 0.80529958]
should be 1.0, predicted: [ 0.56230277], target predicted: [ 0.13525429]
should be 1.0, predicted: [ 0.80179006], target predicted: [ 0.38732293]
start searching new action
get new action:  0.3120708465576172
start getting new_q
get new_q:  0.10102343559265137
train start:
critic loss [0.12708205, 0.067514926]
critic loss [0.11948428, 0.064620838]
critic loss [0.10613827, 0.059960745]
critic loss [0.090124696, 0.054591604]
critic loss [0.074501149, 0.04953

critic loss [0.060627177, 0.048348635]
critic loss [0.055322189, 0.045500249]
critic loss [0.050177142, 0.043095738]
critic loss [0.045929782, 0.041397575]
train end 0.40697455406188965
should be 1.0, predicted: [ 0.72349948], target predicted: [ 0.37028483]
should be 1.0, predicted: [ 0.88131511], target predicted: [ 0.46312609]
start searching new action
get new action:  0.24285149574279785
start getting new_q
get new_q:  0.07829475402832031
train start:
critic loss [0.056336325, 0.052440971]
critic loss [0.056546099, 0.052525625]
critic loss [0.055617191, 0.051878843]
critic loss [0.053514294, 0.050384887]
critic loss [0.050468974, 0.048135936]
train end 0.5237224102020264
should be 1.0, predicted: [ 0.78946555], target predicted: [ 0.70434731]
start searching new action
get new action:  0.24516081809997559
start getting new_q
get new_q:  0.06711888313293457
train start:
critic loss [0.085420698, 0.052829694]
critic loss [0.07867559, 0.04933013]
critic loss [0.069445461, 0.045077227

get new action:  0.2287459373474121
start getting new_q
get new_q:  0.06414175033569336
train start:
critic loss [0.071244627, 0.048295636]
critic loss [0.069370478, 0.047678892]
critic loss [0.065696634, 0.046653807]
critic loss [0.060904339, 0.045323305]
critic loss [0.055800401, 0.043810017]
train end 0.4249591827392578
should be -1.0, predicted: [-0.69249183], target predicted: [-0.53046161]
should be 1.0, predicted: [ 0.70516491], target predicted: [ 0.40523735]
Episode : 97 Replay Buffer 9351
TOTAL REWARD @ 97-th Episode  : Reward 1
Total Step: 126
start searching new action
get new action:  0.253903865814209
start getting new_q
get new_q:  0.06962442398071289
train start:
critic loss [0.075807102, 0.055704616]
critic loss [0.074359246, 0.054842729]
critic loss [0.071224809, 0.053460624]
critic loss [0.066802941, 0.051621817]
critic loss [0.061614841, 0.049457837]
train end 0.4399242401123047
should be 1.0, predicted: [ 0.49799535], target predicted: [ 0.20589057]
start searching

get new action:  0.22815704345703125
start getting new_q
get new_q:  0.07197761535644531
train start:
critic loss [0.085198469, 0.057327352]
critic loss [0.080571175, 0.057015743]
critic loss [0.074448638, 0.055929489]
critic loss [0.067579538, 0.054078475]
critic loss [0.060687818, 0.051678985]
train end 0.4086487293243408
should be 1.0, predicted: [ 1.19987905], target predicted: [ 0.63375211]
should be 1.0, predicted: [ 0.48794782], target predicted: [ 0.18955474]
start searching new action
get new action:  0.22601008415222168
start getting new_q
get new_q:  0.07023906707763672
train start:
critic loss [0.058570497, 0.058075294]
critic loss [0.057681784, 0.057173111]
critic loss [0.056600519, 0.056077033]
critic loss [0.055408843, 0.054875016]
critic loss [0.054131851, 0.05360264]
train end 0.41626596450805664
should be 1.0, predicted: [ 0.77253956], target predicted: [ 0.53306502]
Episode : 99 Replay Buffer 9544
TOTAL REWARD @ 99-th Episode  : Reward 1
Total Step: 130
start searchi

get new action:  0.2543768882751465
start getting new_q
get new_q:  0.0690004825592041
train start:
critic loss [0.077254042, 0.047017872]
critic loss [0.074111789, 0.046282113]
critic loss [0.068279266, 0.044857293]
critic loss [0.0609538, 0.043025561]
critic loss [0.053450625, 0.041140884]
train end 0.552375316619873
should be 1.0, predicted: [ 0.93843597], target predicted: [ 0.5609026]
should be -1.0, predicted: [-0.7767877], target predicted: [-0.5036245]
should be 1.0, predicted: [ 0.69949424], target predicted: [ 0.27591148]
start searching new action
get new action:  0.2500455379486084
start getting new_q
get new_q:  0.06935429573059082
train start:
critic loss [0.13369963, 0.055986769]
critic loss [0.1279148, 0.056195628]
critic loss [0.11701937, 0.055394206]
critic loss [0.10333734, 0.053908981]
critic loss [0.089110903, 0.052100398]
train end 0.42956089973449707
should be 1.0, predicted: [ 0.77828795], target predicted: [ 0.63757461]
should be -1.0, predicted: [-0.42818609],

Episode : 102 Replay Buffer 9924
TOTAL REWARD @ 102-th Episode  : Reward 1
Total Step: 112
start searching new action
get new action:  0.2413928508758545
start getting new_q
get new_q:  0.06662464141845703
train start:
critic loss [0.06188738, 0.050742462]
critic loss [0.061118584, 0.049535599]
critic loss [0.05943656, 0.048202448]
critic loss [0.056842249, 0.0466277]
critic loss [0.053351831, 0.044731162]
train end 0.4863169193267822
should be 1.0, predicted: [ 0.94807112], target predicted: [ 0.80077165]
should be 1.0, predicted: [ 0.94148642], target predicted: [ 0.61779171]
should be 1.0, predicted: [ 0.51057839], target predicted: [ 0.47990677]
start searching new action
get new action:  0.2655322551727295
start getting new_q
get new_q:  0.07147622108459473
train start:
critic loss [0.076542616, 0.046045601]
critic loss [0.074447967, 0.045385424]
critic loss [0.069144636, 0.044211462]
critic loss [0.062100627, 0.042768512]
critic loss [0.0547265, 0.04129789]
train end 0.5349659919

should be 1.0, predicted: [ 0.94441342], target predicted: [ 0.28048366]
start searching new action
get new action:  0.27335238456726074
start getting new_q
get new_q:  0.07636356353759766
train start:
critic loss [0.064711161, 0.063885748]
critic loss [0.067279279, 0.066469997]
critic loss [0.067182273, 0.066594794]
critic loss [0.064786136, 0.064503446]
critic loss [0.060814105, 0.060761899]
train end 0.5024290084838867
should be 1.0, predicted: [ 1.15090251], target predicted: [ 0.92227578]
start searching new action
get new action:  0.24546051025390625
start getting new_q
get new_q:  0.06313133239746094
train start:
critic loss [0.084420547, 0.072149672]
critic loss [0.079557434, 0.067534491]
critic loss [0.073561825, 0.062484518]
critic loss [0.067233145, 0.057578955]
critic loss [0.061190337, 0.053219765]
train end 0.40894007682800293
should be 1.0, predicted: [ 0.73658097], target predicted: [ 0.45471367]
should be 1.0, predicted: [ 0.91839111], target predicted: [ 0.75464541]
E

get new action:  0.2532026767730713
start getting new_q
get new_q:  0.06884455680847168
train start:
critic loss [0.098365307, 0.052787006]
critic loss [0.095112503, 0.051913925]
critic loss [0.087727971, 0.050330177]
critic loss [0.077879667, 0.048356313]
critic loss [0.067378402, 0.046346046]
train end 0.45312047004699707
should be 1.0, predicted: [ 0.563806], target predicted: [ 0.51829481]
should be 1.0, predicted: [ 0.64146709], target predicted: [ 0.71254903]
should be -1.0, predicted: [-0.5603528], target predicted: [-0.29415691]
start searching new action
get new action:  0.23879599571228027
start getting new_q
get new_q:  0.06798338890075684
train start:
critic loss [0.066570073, 0.046104606]
critic loss [0.064791724, 0.045699447]
critic loss [0.061175279, 0.044705827]
critic loss [0.056386277, 0.043298125]
critic loss [0.05118376, 0.041661453]
train end 0.42053866386413574
should be 1.0, predicted: [ 1.02515554], target predicted: [ 0.90322912]
should be 1.0, predicted: [ 0.6

get new action:  0.23611831665039062
start getting new_q
get new_q:  0.0731210708618164
train start:
critic loss [0.064478546, 0.042770341]
critic loss [0.061546698, 0.041758299]
critic loss [0.056539763, 0.040376768]
critic loss [0.050574042, 0.038882677]
critic loss [0.044751801, 0.037468027]
train end 0.7624344825744629
should be 1.0, predicted: [ 0.66721386], target predicted: [ 0.53470081]
should be 1.0, predicted: [ 0.69866246], target predicted: [ 0.576855]
start searching new action
get new action:  0.24129700660705566
start getting new_q
get new_q:  0.07989215850830078
train start:
critic loss [0.10049577, 0.048904158]
critic loss [0.095257163, 0.048771784]
critic loss [0.086856224, 0.048251178]
critic loss [0.077168703, 0.047540452]
critic loss [0.067635551, 0.046757754]
train end 0.4466226100921631
should be 1.0, predicted: [ 0.66698766], target predicted: [ 0.34783152]
should be 1.0, predicted: [ 0.52345777], target predicted: [ 0.27170828]
start searching new action
get ne

critic loss [0.045228526, 0.045059849]
train end 0.4050116539001465
should be 1.0, predicted: [ 1.03060186], target predicted: [ 0.87859631]
should be -1.0, predicted: [-0.8546856], target predicted: [-0.95922631]
start searching new action
get new action:  0.23226475715637207
start getting new_q
get new_q:  0.06842780113220215
train start:
critic loss [0.1157416, 0.060372554]
critic loss [0.10813902, 0.056603208]
critic loss [0.097638875, 0.052781694]
critic loss [0.085706398, 0.049260743]
critic loss [0.073866092, 0.046270967]
train end 0.42620253562927246
should be 1.0, predicted: [ 0.30872348], target predicted: [-0.15261164]
should be 1.0, predicted: [ 0.73732561], target predicted: [ 0.65862966]
should be -1.0, predicted: [-0.59357738], target predicted: [-0.44742405]
start searching new action
get new action:  0.2383735179901123
start getting new_q
get new_q:  0.060803890228271484
train start:
critic loss [0.064768434, 0.03959848]
critic loss [0.064266838, 0.039539982]
critic lo

critic loss [0.050835475, 0.047297992]
critic loss [0.047653705, 0.045230035]
critic loss [0.044682991, 0.043127477]
critic loss [0.042237848, 0.041182939]
train end 0.48046231269836426
should be 1.0, predicted: [ 0.82785505], target predicted: [ 0.28257075]
should be 1.0, predicted: [ 0.72516608], target predicted: [ 0.6479851]
start searching new action
get new action:  0.24467992782592773
start getting new_q
get new_q:  0.06281018257141113
train start:
critic loss [0.032536943, 0.032536943]
critic loss [0.032012694, 0.032012694]
critic loss [0.031440295, 0.031440295]
critic loss [0.030777846, 0.030777846]
critic loss [0.030006591, 0.030006591]
train end 0.42765021324157715
Episode : 111 Replay Buffer 10470
TOTAL REWARD @ 111-th Episode  : Reward -1
Total Step: 7
start searching new action
get new action:  0.23349332809448242
start getting new_q
get new_q:  0.06383061408996582
train start:
critic loss [0.059070833, 0.041019179]
critic loss [0.05679749, 0.040549703]
critic loss [0.053

get new action:  0.22618460655212402
start getting new_q
get new_q:  0.07518339157104492
train start:
critic loss [0.071297899, 0.047672667]
critic loss [0.069747053, 0.048782282]
critic loss [0.067029729, 0.049951769]
critic loss [0.06375359, 0.050992616]
critic loss [0.060373053, 0.05168391]
train end 0.6206953525543213
should be 1.0, predicted: [ 0.84193546], target predicted: [ 0.27882516]
should be 1.0, predicted: [ 1.05636621], target predicted: [ 0.75105309]
should be 1.0, predicted: [ 0.82643104], target predicted: [ 0.3222135]
Episode : 113 Replay Buffer 10748
TOTAL REWARD @ 113-th Episode  : Reward 1
Total Step: 153
start searching new action
get new action:  0.24682307243347168
start getting new_q
get new_q:  0.0655357837677002
train start:
critic loss [0.095702142, 0.069575116]
critic loss [0.093023889, 0.069393374]
critic loss [0.086505145, 0.06702368]
critic loss [0.077745676, 0.063198179]
critic loss [0.068459779, 0.058788568]
train end 0.5129215717315674
should be -1.0,

get new action:  0.23331999778747559
start getting new_q
get new_q:  0.07905745506286621
train start:
critic loss [0.12313356, 0.054149106]
critic loss [0.11967991, 0.05443763]
critic loss [0.11111272, 0.053903781]
critic loss [0.099140212, 0.052690014]
critic loss [0.085803173, 0.051081747]
train end 0.528923511505127
should be 1.0, predicted: [ 0.52350396], target predicted: [ 0.19798927]
should be 1.0, predicted: [ 0.85300303], target predicted: [ 0.80812311]
should be 1.0, predicted: [ 0.8469556], target predicted: [ 0.32195479]
should be -1.0, predicted: [-0.51219219], target predicted: [-0.50892186]
should be -1.0, predicted: [-0.56805193], target predicted: [-0.5858925]
start searching new action
get new action:  0.24490594863891602
start getting new_q
get new_q:  0.06803488731384277
train start:
critic loss [0.092897654, 0.055903468]
critic loss [0.086721525, 0.054788575]
critic loss [0.077969596, 0.052806206]
critic loss [0.068675749, 0.050569553]
critic loss [0.060140043, 0.0

critic loss [0.062918954, 0.046124604]
critic loss [0.060106218, 0.047302801]
critic loss [0.056487672, 0.047537405]
critic loss [0.052431613, 0.046701804]
train end 0.41434359550476074
should be 1.0, predicted: [ 0.81625462], target predicted: [ 0.45813718]
should be 1.0, predicted: [ 0.75272024], target predicted: [ 0.21209183]
should be 1.0, predicted: [ 0.86625373], target predicted: [ 0.27732787]
start searching new action
get new action:  0.24170255661010742
start getting new_q
get new_q:  0.06474804878234863
train start:
critic loss [0.072377585, 0.051590137]
critic loss [0.069188841, 0.050039425]
critic loss [0.063610539, 0.047508754]
critic loss [0.056873441, 0.044552237]
critic loss [0.050065078, 0.04165218]
train end 0.39748191833496094
should be -1.0, predicted: [-0.54674339], target predicted: [-0.47005135]
should be -1.0, predicted: [-0.85421324], target predicted: [-0.59752077]
start searching new action
get new action:  0.2274641990661621
start getting new_q
get new_q: 

get new action:  0.24532771110534668
start getting new_q
get new_q:  0.08010458946228027
train start:
critic loss [0.10874853, 0.04366453]
critic loss [0.10616753, 0.043849237]
critic loss [0.099085517, 0.043217696]
critic loss [0.088984385, 0.041981291]
critic loss [0.077462226, 0.040402405]
train end 0.774097204208374
should be -1.0, predicted: [-0.114159], target predicted: [ 0.08095933]
should be 1.0, predicted: [ 0.72085756], target predicted: [ 0.22146018]
start searching new action
get new action:  0.25209617614746094
start getting new_q
get new_q:  0.06644940376281738
train start:
critic loss [0.083200023, 0.043489605]
critic loss [0.078517325, 0.043088667]
critic loss [0.070996463, 0.042288974]
critic loss [0.062416352, 0.04145661]
critic loss [0.05438675, 0.04086832]
train end 0.4787468910217285
should be -1.0, predicted: [-0.76499748], target predicted: [-0.38165098]
should be -1.0, predicted: [-0.72723073], target predicted: [-0.27541932]
start searching new action
get new 

get new action:  0.2558443546295166
start getting new_q
get new_q:  0.07410335540771484
train start:
critic loss [0.063517548, 0.039021075]
critic loss [0.060336113, 0.038332775]
critic loss [0.055563342, 0.03751938]
critic loss [0.050151587, 0.036725961]
critic loss [0.045094587, 0.036058463]
train end 0.5944485664367676
should be 1.0, predicted: [ 0.50149679], target predicted: [ 0.22344737]
start searching new action
get new action:  0.24566292762756348
start getting new_q
get new_q:  0.0653390884399414
train start:
critic loss [0.078022286, 0.042166509]
critic loss [0.075139076, 0.042671356]
critic loss [0.070361704, 0.042850398]
critic loss [0.06458427, 0.042733502]
critic loss [0.058571965, 0.042310998]
train end 0.4145371913909912
should be 1.0, predicted: [ 0.68253464], target predicted: [ 0.27576932]
should be 1.0, predicted: [ 0.83684582], target predicted: [ 0.44545022]
should be 1.0, predicted: [ 0.60604578], target predicted: [ 0.31637341]
should be 1.0, predicted: [ 1.052

critic loss [0.054861732, 0.04209961]
critic loss [0.051591117, 0.041423284]
critic loss [0.0480799, 0.04089006]
critic loss [0.044904456, 0.040466454]
train end 0.41169142723083496
should be -1.0, predicted: [-0.99610633], target predicted: [-0.64778489]
should be 1.0, predicted: [ 0.85503113], target predicted: [ 0.71453708]
should be 1.0, predicted: [ 0.74583447], target predicted: [ 0.34334958]
Episode : 122 Replay Buffer 11362
TOTAL REWARD @ 122-th Episode  : Reward -1
Total Step: 13
start searching new action
get new action:  0.2450864315032959
start getting new_q
get new_q:  0.07312297821044922
train start:
critic loss [0.066651151, 0.039236836]
critic loss [0.065186851, 0.039084151]
critic loss [0.060485728, 0.038055778]
critic loss [0.053783275, 0.036444165]
critic loss [0.04656104, 0.034665465]
train end 0.40218138694763184
should be 1.0, predicted: [ 0.50330895], target predicted: [ 0.59967601]
start searching new action
get new action:  0.22286343574523926
start getting new

get new action:  0.231583833694458
start getting new_q
get new_q:  0.07228994369506836
train start:
critic loss [0.14108685, 0.058071706]
critic loss [0.12827529, 0.054656945]
critic loss [0.11010474, 0.050827786]
critic loss [0.090916574, 0.047493294]
critic loss [0.074107148, 0.045281731]
train end 0.4455864429473877
should be 1.0, predicted: [ 0.9409461], target predicted: [ 0.61850655]
should be 1.0, predicted: [ 0.55395448], target predicted: [ 0.21539499]
should be 1.0, predicted: [ 0.44056001], target predicted: [ 0.2579042]
should be 1.0, predicted: [ 0.91913342], target predicted: [ 0.26585838]
Episode : 124 Replay Buffer 11489
TOTAL REWARD @ 124-th Episode  : Reward 1
Total Step: 115
start searching new action
get new action:  0.24429106712341309
start getting new_q
get new_q:  0.06941676139831543
train start:
critic loss [0.10283346, 0.04662399]
critic loss [0.093760505, 0.048230447]
critic loss [0.083942503, 0.050133001]
critic loss [0.075111285, 0.052187219]
critic loss [0

critic loss [0.047786675, 0.044248879]
train end 0.38504672050476074
should be -1.0, predicted: [-0.93725854], target predicted: [-0.79240805]
should be 1.0, predicted: [ 0.89044398], target predicted: [ 1.04688179]
should be 1.0, predicted: [ 1.00458515], target predicted: [ 0.69317168]
should be 1.0, predicted: [ 0.78555554], target predicted: [ 0.70366347]
start searching new action
get new action:  0.24926376342773438
start getting new_q
get new_q:  0.08288788795471191
train start:
critic loss [0.057346459, 0.045308478]
critic loss [0.056176163, 0.044301983]
critic loss [0.053499158, 0.042772885]
critic loss [0.049803987, 0.040922388]
critic loss [0.045687936, 0.038964286]
train end 0.4003732204437256
should be -1.0, predicted: [-0.69118226], target predicted: [-0.49786398]
should be 1.0, predicted: [ 0.91588414], target predicted: [ 0.59421754]
start searching new action
get new action:  0.23979783058166504
start getting new_q
get new_q:  0.07245230674743652
train start:
critic lo

get new action:  0.22470521926879883
start getting new_q
get new_q:  0.06405282020568848
train start:
critic loss [0.085629739, 0.042457893]
critic loss [0.077995256, 0.040679391]
critic loss [0.067731634, 0.038655698]
critic loss [0.057478298, 0.037123337]
critic loss [0.049254201, 0.036517456]
train end 0.43297243118286133
should be 1.0, predicted: [ 1.05275607], target predicted: [ 0.65342861]
should be 1.0, predicted: [ 0.82306701], target predicted: [ 0.26347315]
should be 1.0, predicted: [ 0.96884894], target predicted: [ 0.39917994]
should be 1.0, predicted: [ 0.7248866], target predicted: [ 0.28482857]
should be -1.0, predicted: [-0.80655992], target predicted: [-0.64501899]
should be 1.0, predicted: [ 0.87370801], target predicted: [ 0.66820973]
start searching new action
get new action:  0.2386455535888672
start getting new_q
get new_q:  0.06473112106323242
train start:
critic loss [0.066088662, 0.045962553]
critic loss [0.064900465, 0.046796694]
critic loss [0.061512571, 0.0

critic loss [0.10464701, 0.043760348]
critic loss [0.095039666, 0.043123201]
critic loss [0.082202375, 0.041702092]
critic loss [0.068539679, 0.039784804]
train end 0.5309047698974609
should be 1.0, predicted: [ 0.66452324], target predicted: [ 0.66241091]
should be 1.0, predicted: [ 0.8524226], target predicted: [ 0.9315871]
should be 1.0, predicted: [ 0.22508709], target predicted: [-0.04192147]
should be 1.0, predicted: [ 0.83438653], target predicted: [ 0.81688207]
start searching new action
get new action:  0.24518656730651855
start getting new_q
get new_q:  0.06728315353393555
train start:
critic loss [0.049162127, 0.032222427]
critic loss [0.046852194, 0.031755209]
critic loss [0.043326825, 0.03110861]
critic loss [0.039305523, 0.030391734]
critic loss [0.035441499, 0.029698653]
train end 0.4613356590270996
should be 1.0, predicted: [ 0.6447137], target predicted: [ 0.15197326]
should be -1.0, predicted: [-1.09768856], target predicted: [-0.64731187]
should be 1.0, predicted: [ 

get new action:  0.23089814186096191
start getting new_q
get new_q:  0.06526899337768555
train start:
critic loss [0.088808656, 0.057536382]
critic loss [0.087645441, 0.05893277]
critic loss [0.082892232, 0.058943436]
critic loss [0.075731158, 0.05766993]
critic loss [0.067556046, 0.055394072]
train end 0.4152951240539551
should be 1.0, predicted: [ 0.55450565], target predicted: [ 0.40814546]
should be 1.0, predicted: [ 0.62627834], target predicted: [ 0.26652953]
start searching new action
get new action:  0.2454385757446289
start getting new_q
get new_q:  0.06489205360412598
train start:
critic loss [0.063395903, 0.048032522]
critic loss [0.061236784, 0.046590284]
critic loss [0.05660677, 0.044216871]
critic loss [0.050653346, 0.041379027]
critic loss [0.044569347, 0.038528346]
train end 0.40516161918640137
should be 1.0, predicted: [ 1.18266237], target predicted: [ 1.07585835]
should be 1.0, predicted: [ 0.73946768], target predicted: [ 0.03240132]
start searching new action
get n

critic loss [0.060037456, 0.055023793]
train end 0.4422001838684082
should be 1.0, predicted: [ 0.79826111], target predicted: [ 0.32316718]
should be 1.0, predicted: [ 1.12799215], target predicted: [ 0.59476537]
should be 1.0, predicted: [ 1.11831403], target predicted: [ 0.75416076]
should be 1.0, predicted: [ 0.89581513], target predicted: [ 0.72242701]
start searching new action
get new action:  0.2519967555999756
start getting new_q
get new_q:  0.06433606147766113
train start:
critic loss [0.099292561, 0.067311041]
critic loss [0.093330413, 0.064437553]
critic loss [0.084074482, 0.06017077]
critic loss [0.073894359, 0.055552982]
critic loss [0.064631388, 0.051366694]
train end 0.45708203315734863
should be -1.0, predicted: [-0.76877552], target predicted: [-0.44106188]
should be 1.0, predicted: [ 0.61295265], target predicted: [ 0.46124896]
should be 1.0, predicted: [ 1.1281004], target predicted: [ 0.77224702]
should be 1.0, predicted: [ 0.58030593], target predicted: [ 0.404137

Episode : 134 Replay Buffer 12591
TOTAL REWARD @ 134-th Episode  : Reward 1
Total Step: 126
start searching new action
get new action:  0.2583041191101074
start getting new_q
get new_q:  0.06853818893432617
train start:
critic loss [0.080932438, 0.074547797]
critic loss [0.090278313, 0.083816908]
critic loss [0.094731897, 0.088908568]
critic loss [0.093891449, 0.089226335]
critic loss [0.088562697, 0.085267678]
train end 0.4877941608428955
should be 1.0, predicted: [ 0.86293733], target predicted: [ 0.74793124]
should be -1.0, predicted: [-0.6689921], target predicted: [-0.70662904]
start searching new action
get new action:  0.24781131744384766
start getting new_q
get new_q:  0.06920886039733887
train start:
critic loss [0.085470222, 0.075697094]
critic loss [0.077399738, 0.069348574]
critic loss [0.068642616, 0.062346052]
critic loss [0.060556553, 0.055692151]
critic loss [0.053884886, 0.049952157]
train end 0.4819672107696533
should be 1.0, predicted: [ 0.79199922], target predicted

get new action:  0.23711490631103516
start getting new_q
get new_q:  0.07161092758178711
train start:
critic loss [0.053988673, 0.040250156]
critic loss [0.051511221, 0.038651615]
critic loss [0.047894675, 0.036778614]
critic loss [0.043779638, 0.034890074]
critic loss [0.039733149, 0.033156767]
train end 0.5040264129638672
should be 1.0, predicted: [ 0.93740714], target predicted: [ 0.87310344]
should be 1.0, predicted: [ 1.14224613], target predicted: [ 0.6934433]
should be 1.0, predicted: [ 0.78238845], target predicted: [ 0.75312412]
should be 1.0, predicted: [ 0.85090363], target predicted: [ 0.63336289]
start searching new action
get new action:  0.24599838256835938
start getting new_q
get new_q:  0.06670618057250977
train start:
critic loss [0.072274588, 0.055536613]
critic loss [0.067645624, 0.053509437]
critic loss [0.061934091, 0.050955433]
critic loss [0.055998616, 0.048223756]
critic loss [0.050511278, 0.045570951]
train end 0.5098755359649658
should be 1.0, predicted: [ 0.

critic loss [0.034388594, 0.034177326]
train end 0.5164175033569336
should be -1.0, predicted: [-0.94398493], target predicted: [-0.74165696]
start searching new action
get new action:  0.2458178997039795
start getting new_q
get new_q:  0.07120490074157715
train start:
critic loss [0.030269522, 0.030269522]
critic loss [0.029619504, 0.029619504]
critic loss [0.028833451, 0.028833451]
critic loss [0.027940912, 0.027940912]
critic loss [0.027012236, 0.027012236]
train end 0.41652584075927734
Episode : 138 Replay Buffer 12813
TOTAL REWARD @ 138-th Episode  : Reward -1
Total Step: 63
start searching new action
get new action:  0.2456803321838379
start getting new_q
get new_q:  0.0689554214477539
train start:
critic loss [0.038218506, 0.031973477]
critic loss [0.037430398, 0.031697415]
critic loss [0.0361083, 0.031196855]
critic loss [0.034449361, 0.030517161]
critic loss [0.03264714, 0.029699061]
train end 0.5367553234100342
should be 1.0, predicted: [ 0.77219421], target predicted: [ 0.58

get new action:  0.2350008487701416
start getting new_q
get new_q:  0.06660962104797363
train start:
critic loss [0.067672193, 0.05129654]
critic loss [0.064700678, 0.052668903]
critic loss [0.061030779, 0.052966155]
critic loss [0.057104133, 0.052133717]
critic loss [0.053057581, 0.050251313]
train end 0.5230560302734375
should be 1.0, predicted: [ 0.8136248], target predicted: [ 0.58653373]
should be 1.0, predicted: [ 0.88713282], target predicted: [ 0.05737269]
should be 1.0, predicted: [ 1.14560664], target predicted: [ 0.74699897]
start searching new action
get new action:  0.23988008499145508
start getting new_q
get new_q:  0.07255411148071289
train start:
critic loss [0.071645781, 0.05609547]
critic loss [0.068066746, 0.054490633]
critic loss [0.062863827, 0.052005507]
critic loss [0.056769229, 0.048882756]
critic loss [0.050585613, 0.045486365]
train end 0.4811108112335205
should be 1.0, predicted: [ 0.87585652], target predicted: [ 0.7567867]
should be 1.0, predicted: [ 0.8657

get new action:  0.24889779090881348
start getting new_q
get new_q:  0.06702470779418945
train start:
critic loss [0.0434242, 0.042390741]
critic loss [0.043103877, 0.042236481]
critic loss [0.042056706, 0.041290209]
critic loss [0.040339958, 0.039642345]
critic loss [0.038121786, 0.037470501]
train end 0.4406006336212158
should be 1.0, predicted: [ 0.93653196], target predicted: [ 0.92549324]
should be 1.0, predicted: [ 1.09619009], target predicted: [ 0.69318795]
should be 1.0, predicted: [ 1.00095022], target predicted: [ 0.68342763]
start searching new action
get new action:  0.23256969451904297
start getting new_q
get new_q:  0.06535100936889648
train start:
critic loss [0.05427843, 0.040338159]
critic loss [0.052181426, 0.038930532]
critic loss [0.049254861, 0.03742478]
critic loss [0.04582791, 0.035908908]
critic loss [0.042223163, 0.034444328]
train end 0.43379759788513184
should be 1.0, predicted: [ 0.76042002], target predicted: [ 0.2086651]
start searching new action
get new

get new action:  0.23842406272888184
start getting new_q
get new_q:  0.061937808990478516
train start:
critic loss [0.100086, 0.039439656]
critic loss [0.094347715, 0.038656745]
critic loss [0.084997043, 0.037719551]
critic loss [0.074056365, 0.03686817]
critic loss [0.063141972, 0.036246803]
train end 0.4404175281524658
should be 1.0, predicted: [ 0.47781369], target predicted: [ 0.493983]
should be -1.0, predicted: [-0.76516378], target predicted: [-0.27772585]
should be 1.0, predicted: [ 0.74145949], target predicted: [ 0.64169639]
should be 1.0, predicted: [ 0.95409274], target predicted: [ 0.58433914]
should be -1.0, predicted: [-0.84458435], target predicted: [-0.47342262]
should be 1.0, predicted: [ 0.74294674], target predicted: [ 0.29889283]
start searching new action
get new action:  0.2509891986846924
start getting new_q
get new_q:  0.07259821891784668
train start:
critic loss [0.047514234, 0.034863681]
critic loss [0.046549819, 0.035275437]
critic loss [0.044427663, 0.03522

Episode : 145 Replay Buffer 13737
TOTAL REWARD @ 145-th Episode  : Reward -1
Total Step: 135
start searching new action
get new action:  0.26716041564941406
start getting new_q
get new_q:  0.07453656196594238
train start:
critic loss [0.11848631, 0.055116422]
critic loss [0.11593473, 0.05655596]
critic loss [0.10808092, 0.056436937]
critic loss [0.096592516, 0.054916464]
critic loss [0.083421767, 0.052396037]
train end 0.574084997177124
should be 1.0, predicted: [ 0.43765032], target predicted: [-0.15162235]
should be 1.0, predicted: [ 0.96094191], target predicted: [ 0.8600927]
should be 1.0, predicted: [ 0.77639359], target predicted: [ 0.60113221]
should be 1.0, predicted: [ 0.38436082], target predicted: [ 0.18603061]
start searching new action
get new action:  0.26624059677124023
start getting new_q
get new_q:  0.08469629287719727
train start:
critic loss [0.047864623, 0.047864623]
critic loss [0.046609294, 0.046609294]
critic loss [0.045001842, 0.045001842]
critic loss [0.0432154

Episode : 147 Replay Buffer 13893
TOTAL REWARD @ 147-th Episode  : Reward 1
Total Step: 128
start searching new action
get new action:  0.23755359649658203
start getting new_q
get new_q:  0.0741891860961914
train start:
critic loss [0.078096487, 0.044691347]
critic loss [0.075771093, 0.046228901]
critic loss [0.07176476, 0.047315296]
critic loss [0.066696875, 0.047719851]
critic loss [0.061060153, 0.047304224]
train end 0.5035746097564697
should be 1.0, predicted: [ 0.74751365], target predicted: [ 0.65080243]
should be 1.0, predicted: [ 1.18336368], target predicted: [ 0.91159904]
should be 1.0, predicted: [ 0.63188362], target predicted: [ 0.31545401]
should be 1.0, predicted: [ 1.07891965], target predicted: [ 0.54579896]
should be 1.0, predicted: [ 0.8605659], target predicted: [ 0.49538383]
should be 1.0, predicted: [ 0.98918623], target predicted: [ 0.72666961]
start searching new action
get new action:  0.2353515625
start getting new_q
get new_q:  0.06394600868225098
train start

get new action:  0.23052978515625
start getting new_q
get new_q:  0.06818914413452148
train start:
critic loss [0.069136664, 0.048404515]
critic loss [0.066278383, 0.047702689]
critic loss [0.062575102, 0.046723157]
critic loss [0.058329571, 0.045474835]
critic loss [0.053826448, 0.043982465]
train end 0.45069241523742676
should be 1.0, predicted: [ 0.6586951], target predicted: [ 0.47345895]
should be 1.0, predicted: [ 0.90256631], target predicted: [ 0.38072848]
start searching new action
get new action:  0.23590636253356934
start getting new_q
get new_q:  0.0636758804321289
train start:
critic loss [0.096113667, 0.046675943]
critic loss [0.091613352, 0.04638816]
critic loss [0.08397495, 0.045531929]
critic loss [0.07457073, 0.044283807]
critic loss [0.064881772, 0.042886399]
train end 0.4179046154022217
should be 1.0, predicted: [ 0.73382235], target predicted: [ 0.623932]
should be 1.0, predicted: [ 0.83557475], target predicted: [ 0.39669314]
should be 1.0, predicted: [ 0.52294552

critic loss [0.051243283, 0.035891004]
critic loss [0.048480771, 0.034983382]
critic loss [0.045138627, 0.033938069]
critic loss [0.041398346, 0.032806553]
train end 1.6893939971923828
should be -1.0, predicted: [-0.92013288], target predicted: [-0.88131005]
should be 1.0, predicted: [ 0.59185034], target predicted: [ 0.34560463]
start searching new action
get new action:  0.2509465217590332
start getting new_q
get new_q:  0.06929969787597656
train start:
critic loss [0.062098369, 0.038621694]
critic loss [0.059082024, 0.037807055]
critic loss [0.054778736, 0.036776207]
critic loss [0.049871825, 0.03568285]
critic loss [0.045004524, 0.034656607]
actor loss -48.405
train end 3.406447649002075
should be 1.0, predicted: [ 1.01657367], target predicted: [ 0.99898577]
should be 1.0, predicted: [ 0.68637371], target predicted: [ 0.49023664]
should be 1.0, predicted: [ 0.67056048], target predicted: [ 0.46401414]
start searching new action
get new action:  0.257587194442749
start getting new_

critic loss [0.034947615, 0.034205873]
actor loss -58.8054
train end 0.5146293640136719
should be 1.0, predicted: [ 1.03324211], target predicted: [ 0.78510052]
should be 1.0, predicted: [ 1.00682712], target predicted: [ 0.81963897]
start searching new action
get new action:  0.2305147647857666
start getting new_q
get new_q:  0.05955243110656738
train start:
critic loss [0.049154155, 0.039395593]
critic loss [0.049122546, 0.039580353]
critic loss [0.047663368, 0.038972478]
critic loss [0.045050252, 0.037662487]
critic loss [0.041742943, 0.035866104]
actor loss -49.5675
train end 0.520209789276123
should be -1.0, predicted: [-0.86986476], target predicted: [-0.79412943]
should be 1.0, predicted: [ 0.69731379], target predicted: [ 0.4209559]
Episode : 152 Replay Buffer 14350
TOTAL REWARD @ 152-th Episode  : Reward 1
Total Step: 126
start searching new action
get new action:  0.24020910263061523
start getting new_q
get new_q:  0.06973552703857422
train start:
critic loss [0.031809926, 0.

critic loss [0.040963028, 0.034849949]
actor loss 5.7345
train end 0.5406718254089355
should be 1.0, predicted: [ 0.87369043], target predicted: [ 0.55288684]
should be 1.0, predicted: [ 0.87135273], target predicted: [ 0.87607068]
should be 1.0, predicted: [ 0.83026582], target predicted: [ 0.4992035]
should be 1.0, predicted: [ 1.09073424], target predicted: [ 0.82427287]
should be 1.0, predicted: [ 0.81968457], target predicted: [ 0.69239831]
start searching new action
get new action:  0.23288631439208984
start getting new_q
get new_q:  0.06551098823547363
train start:
critic loss [0.081219964, 0.039870087]
critic loss [0.078348368, 0.039604619]
critic loss [0.073070183, 0.038827211]
critic loss [0.066238903, 0.037675623]
critic loss [0.058810405, 0.036305439]
actor loss -42.8527
train end 0.6425681114196777
should be -1.0, predicted: [-0.62275952], target predicted: [-0.39169085]
should be 1.0, predicted: [ 0.82458472], target predicted: [ 0.64654225]
should be 1.0, predicted: [ 0.

critic loss [0.053603761, 0.049673222]
critic loss [0.05059009, 0.047486901]
critic loss [0.047299832, 0.045193847]
critic loss [0.044169642, 0.043004792]
actor loss -89.6456
train end 0.5797898769378662
should be 1.0, predicted: [ 0.82223338], target predicted: [ 0.58738333]
start searching new action
get new action:  0.25219249725341797
start getting new_q
get new_q:  0.06785249710083008
train start:
critic loss [0.046620522, 0.043312185]
critic loss [0.045581289, 0.042516842]
critic loss [0.043232456, 0.04078906]
critic loss [0.040091254, 0.03844678]
critic loss [0.036747269, 0.03586866]
actor loss -42.6547
train end 0.5233709812164307
should be 1.0, predicted: [ 0.88177377], target predicted: [ 0.76039737]
start searching new action
get new action:  0.2274165153503418
start getting new_q
get new_q:  0.05978536605834961
train start:
critic loss [0.072049104, 0.042892419]
critic loss [0.07048507, 0.041571826]
critic loss [0.06720835, 0.04019013]
critic loss [0.06286411, 0.038776353]


get new action:  0.24925541877746582
start getting new_q
get new_q:  0.06525015830993652
train start:
critic loss [0.041375093, 0.041375093]
critic loss [0.042629518, 0.042629518]
critic loss [0.043776404, 0.043776404]
critic loss [0.044354558, 0.044354558]
critic loss [0.044055905, 0.044055905]
actor loss 3.78321
train end 0.6289145946502686
start searching new action
get new action:  0.2767014503479004
start getting new_q
get new_q:  0.07236218452453613
train start:
critic loss [0.11653775, 0.064027518]
critic loss [0.10906404, 0.060662672]
critic loss [0.096025571, 0.055145398]
critic loss [0.080436334, 0.048775181]
critic loss [0.065090016, 0.042792939]
actor loss -79.2322
train end 0.6502439975738525
should be 1.0, predicted: [ 0.93331641], target predicted: [ 0.77211696]
should be -1.0, predicted: [-0.53538829], target predicted: [-0.26207596]
should be 1.0, predicted: [ 1.07331812], target predicted: [ 0.70398194]
should be 1.0, predicted: [ 0.75173187], target predicted: [ 0.41

critic loss [0.055055648, 0.051396981]
actor loss -39.9766
train end 0.5911684036254883
should be 1.0, predicted: [ 0.94136047], target predicted: [ 0.53342634]
should be 1.0, predicted: [ 0.73257482], target predicted: [ 0.69209355]
start searching new action
get new action:  0.23444294929504395
start getting new_q
get new_q:  0.06252884864807129
train start:
critic loss [0.099893376, 0.050899029]
critic loss [0.092220038, 0.04806016]
critic loss [0.081297293, 0.04458677]
critic loss [0.069680437, 0.041173842]
critic loss [0.059317257, 0.038289718]
actor loss -27.2769
train end 0.5650646686553955
should be 1.0, predicted: [ 0.50195134], target predicted: [ 0.3334296]
should be -1.0, predicted: [-0.66283154], target predicted: [-0.258131]
should be 1.0, predicted: [ 0.74160624], target predicted: [ 0.5838933]
should be 1.0, predicted: [ 0.66456729], target predicted: [ 0.70025289]
start searching new action
get new action:  0.2575109004974365
start getting new_q
get new_q:  0.074625015

critic loss [0.090888627, 0.039251644]
critic loss [0.084024943, 0.037860505]
critic loss [0.07573688, 0.036373623]
critic loss [0.066850759, 0.034918431]
actor loss -6.28362
train end 0.506697416305542
should be -1.0, predicted: [-0.32979161], target predicted: [-0.01000531]
should be 1.0, predicted: [ 0.53090435], target predicted: [ 0.14525603]
start searching new action
get new action:  0.22810792922973633
start getting new_q
get new_q:  0.06531786918640137
train start:
critic loss [0.036831871, 0.035993274]
critic loss [0.036909208, 0.036011115]
critic loss [0.036508642, 0.035659932]
critic loss [0.035603423, 0.034898333]
critic loss [0.034256518, 0.033747807]
actor loss -68.1467
train end 0.5256204605102539
should be 1.0, predicted: [ 0.92598212], target predicted: [ 0.90928143]
should be 1.0, predicted: [ 0.98973495], target predicted: [ 0.78551352]
start searching new action
get new action:  0.2241532802581787
start getting new_q
get new_q:  0.0676109790802002
train start:
crit

get new action:  0.23670315742492676
start getting new_q
get new_q:  0.06225895881652832
train start:
critic loss [0.047685139, 0.045235641]
critic loss [0.047550645, 0.044978961]
critic loss [0.046892397, 0.044366658]
critic loss [0.045754299, 0.043396559]
critic loss [0.044203922, 0.042077109]
actor loss -64.1412
train end 0.5201904773712158
should be 1.0, predicted: [ 0.8510133], target predicted: [ 0.82343674]
should be 1.0, predicted: [ 0.94811052], target predicted: [ 0.90521818]
should be 1.0, predicted: [ 0.68481821], target predicted: [ 0.71503067]
should be -1.0, predicted: [-1.12119603], target predicted: [-0.76533574]
start searching new action
get new action:  0.2345900535583496
start getting new_q
get new_q:  0.06461715698242188
train start:
critic loss [0.076929107, 0.042994022]
critic loss [0.075169235, 0.0423171]
critic loss [0.071394615, 0.041337479]
critic loss [0.066100724, 0.040084258]
critic loss [0.059946485, 0.038625441]
actor loss -13.8228
train end 0.512685298

Episode : 164 Replay Buffer 15226
TOTAL REWARD @ 164-th Episode  : Reward -1
Total Step: 150
start searching new action
get new action:  0.23826217651367188
start getting new_q
get new_q:  0.06437325477600098
train start:
critic loss [0.049566984, 0.042610712]
critic loss [0.048048969, 0.042454943]
critic loss [0.04595226, 0.041845221]
critic loss [0.043579608, 0.040862679]
critic loss [0.041188493, 0.039609574]
actor loss -4.12154
train end 0.5830080509185791
should be 1.0, predicted: [ 0.93819147], target predicted: [ 0.5094586]
should be 1.0, predicted: [ 0.85166276], target predicted: [ 0.82346565]
should be 1.0, predicted: [ 1.00221765], target predicted: [ 0.68467969]
start searching new action
get new action:  0.23107266426086426
start getting new_q
get new_q:  0.06805276870727539
train start:
critic loss [0.045626439, 0.041941054]
critic loss [0.043961458, 0.040951122]
critic loss [0.041815124, 0.039633889]
critic loss [0.039546352, 0.03816146]
critic loss [0.037403971, 0.03665

get new action:  0.23755931854248047
start getting new_q
get new_q:  0.07485270500183105
train start:
critic loss [0.052621175, 0.04444506]
critic loss [0.050337799, 0.041794822]
critic loss [0.0473365, 0.039064884]
critic loss [0.044024911, 0.036551289]
critic loss [0.040719554, 0.034414843]
actor loss 19.4714
train end 0.6673436164855957
should be 1.0, predicted: [ 0.85315853], target predicted: [ 0.55309558]
should be 1.0, predicted: [ 0.84080517], target predicted: [ 0.64052123]
should be 1.0, predicted: [ 0.76213151], target predicted: [ 0.58029038]
start searching new action
get new action:  0.24024510383605957
start getting new_q
get new_q:  0.062496185302734375
train start:
critic loss [0.049877405, 0.037686005]
critic loss [0.047917657, 0.037196964]
critic loss [0.04506053, 0.036278393]
critic loss [0.041829646, 0.035064422]
critic loss [0.038536888, 0.033653788]
actor loss -2.14883
train end 0.5897495746612549
should be -1.0, predicted: [-0.56029856], target predicted: [-0.56

critic loss [0.038200811, 0.037147954]
actor loss 5.55007
train end 0.5305502414703369
should be 1.0, predicted: [ 1.13369894], target predicted: [ 0.84762478]
start searching new action
get new action:  0.2530055046081543
start getting new_q
get new_q:  0.08836174011230469
train start:
critic loss [0.057200611, 0.039764322]
critic loss [0.055052482, 0.039658725]
critic loss [0.052193172, 0.039524339]
critic loss [0.049008533, 0.039304838]
critic loss [0.045805387, 0.038909703]
actor loss -31.5499
train end 0.6820085048675537
should be 1.0, predicted: [ 0.70088375], target predicted: [ 0.18332511]
start searching new action
get new action:  0.24020886421203613
start getting new_q
get new_q:  0.08046531677246094
train start:
critic loss [0.054001339, 0.041335672]
critic loss [0.053169467, 0.041604299]
critic loss [0.051032133, 0.041267648]
critic loss [0.047998589, 0.040388934]
critic loss [0.044526272, 0.039083555]
actor loss -5.04354
train end 0.5757126808166504
should be 1.0, predict

get new action:  0.23368000984191895
start getting new_q
get new_q:  0.0657644271850586
train start:
critic loss [0.076741822, 0.04334905]
critic loss [0.073574558, 0.042360656]
critic loss [0.068000741, 0.040988851]
critic loss [0.060975, 0.039399981]
critic loss [0.05352072, 0.037762579]
actor loss -25.5878
train end 0.5008056163787842
should be 1.0, predicted: [ 0.76360041], target predicted: [ 0.62581819]
should be 1.0, predicted: [ 0.8872878], target predicted: [ 0.737517]
should be 1.0, predicted: [ 0.7291382], target predicted: [ 0.38397038]
should be 1.0, predicted: [ 0.44253078], target predicted: [ 0.22305858]
start searching new action
get new action:  0.23598480224609375
start getting new_q
get new_q:  0.061310768127441406
train start:
critic loss [0.07140892, 0.03071272]
critic loss [0.060823102, 0.028879825]
critic loss [0.047267679, 0.02703955]
critic loss [0.035572626, 0.026083576]
critic loss [0.029094342, 0.026417099]
actor loss 0.858868
train end 0.5460333824157715
s

get new action:  0.27425146102905273
start getting new_q
get new_q:  0.06807732582092285
train start:
critic loss [0.095715851, 0.041563891]
critic loss [0.092225581, 0.041979939]
critic loss [0.085031852, 0.041583613]
critic loss [0.075483479, 0.040512279]
critic loss [0.065028474, 0.0389762]
actor loss -62.6647
train end 0.5955257415771484
should be 1.0, predicted: [ 0.3019484], target predicted: [ 0.1609305]
should be 1.0, predicted: [ 1.04310811], target predicted: [ 0.95339179]
should be -1.0, predicted: [-0.72023499], target predicted: [-0.211138]
should be -1.0, predicted: [-0.76917291], target predicted: [-0.53940415]
should be 1.0, predicted: [ 0.91421586], target predicted: [ 0.76324534]
start searching new action
get new action:  0.23989295959472656
start getting new_q
get new_q:  0.06412482261657715
train start:
critic loss [0.069870509, 0.047146577]
critic loss [0.068049148, 0.045990262]
critic loss [0.064292066, 0.044244334]
critic loss [0.059217319, 0.0421164]
critic los

get new action:  0.23591232299804688
start getting new_q
get new_q:  0.07141637802124023
train start:
critic loss [0.096409611, 0.042789869]
critic loss [0.092829406, 0.042439289]
critic loss [0.084833145, 0.041495632]
critic loss [0.074233659, 0.040179253]
critic loss [0.063011542, 0.038786709]
actor loss 26.8267
train end 0.819481611251831
should be 1.0, predicted: [ 0.65614539], target predicted: [ 0.62279296]
should be 1.0, predicted: [ 0.9093467], target predicted: [ 0.65765923]
should be -1.0, predicted: [-0.64143455], target predicted: [-0.54958981]
should be 1.0, predicted: [ 0.98213339], target predicted: [ 0.95266747]
should be 1.0, predicted: [ 0.8041563], target predicted: [ 0.6362198]
start searching new action
get new action:  0.25813937187194824
start getting new_q
get new_q:  0.07094001770019531
train start:
critic loss [0.060727596, 0.044942562]
critic loss [0.057537258, 0.044695504]
critic loss [0.053256851, 0.043990597]
critic loss [0.04886549, 0.043004364]
critic lo

get new action:  0.2490839958190918
start getting new_q
get new_q:  0.06963324546813965
train start:
critic loss [0.12104266, 0.046904139]
critic loss [0.11569001, 0.045083955]
critic loss [0.1010014, 0.042091541]
critic loss [0.081940979, 0.039024033]
critic loss [0.063664369, 0.036982775]
actor loss -23.1019
train end 0.5831325054168701
should be 1.0, predicted: [ 1.14221823], target predicted: [ 0.68360269]
should be 1.0, predicted: [ 0.81264383], target predicted: [ 0.81936008]
should be 1.0, predicted: [ 0.83423191], target predicted: [ 0.10427922]
should be 1.0, predicted: [ 0.86863774], target predicted: [ 0.44074106]
should be 1.0, predicted: [ 0.8033793], target predicted: [ 0.22085232]
start searching new action
get new action:  0.2652430534362793
start getting new_q
get new_q:  0.06621909141540527
train start:
critic loss [0.064712271, 0.043217145]
critic loss [0.063489825, 0.045561332]
critic loss [0.061816923, 0.047763065]
critic loss [0.059525453, 0.049282998]
critic loss

get new action:  0.22832036018371582
start getting new_q
get new_q:  0.0708925724029541
train start:
critic loss [0.063460812, 0.050315101]
critic loss [0.059666682, 0.051824056]
critic loss [0.057696607, 0.052543115]
critic loss [0.056192741, 0.051956803]
critic loss [0.054185316, 0.049898751]
actor loss -38.7434
train end 0.5349879264831543
should be 1.0, predicted: [ 1.05585277], target predicted: [ 1.10499513]
should be 1.0, predicted: [ 0.85712856], target predicted: [ 0.38869834]
should be 1.0, predicted: [ 1.18893707], target predicted: [ 0.20549054]
should be 1.0, predicted: [ 0.61803097], target predicted: [ 0.60818923]
Episode : 176 Replay Buffer 16295
TOTAL REWARD @ 176-th Episode  : Reward 1
Total Step: 38
start searching new action
get new action:  0.2550327777862549
start getting new_q
get new_q:  0.08744454383850098
train start:
critic loss [0.057001069, 0.057001069]
critic loss [0.054339267, 0.054339267]
critic loss [0.050872561, 0.050872561]
critic loss [0.047086973, 0

critic loss [0.042827554, 0.039625619]
critic loss [0.042412207, 0.039315492]
critic loss [0.041210812, 0.038529169]
critic loss [0.039482225, 0.037337419]
critic loss [0.037523869, 0.035856783]
actor loss -78.5509
train end 0.6723587512969971
should be 1.0, predicted: [ 0.94605935], target predicted: [ 0.91410166]
should be -1.0, predicted: [-0.95345759], target predicted: [-0.68193334]
should be 1.0, predicted: [ 0.90139484], target predicted: [ 1.04823709]
start searching new action
get new action:  0.24796485900878906
start getting new_q
get new_q:  0.06462621688842773
train start:
critic loss [0.070281878, 0.035767037]
critic loss [0.064191043, 0.034256063]
critic loss [0.055819437, 0.032412402]
critic loss [0.046996582, 0.030589387]
critic loss [0.039147478, 0.029047854]
actor loss -2.39243
train end 0.5652728080749512
should be 1.0, predicted: [ 1.18009615], target predicted: [ 0.95642406]
should be 1.0, predicted: [ 0.88040525], target predicted: [ 0.68691468]
should be 1.0, pr

critic loss [0.048471835, 0.035582766]
actor loss -95.3642
train end 0.5175690650939941
should be -1.0, predicted: [-0.66597527], target predicted: [-0.40638644]
should be -1.0, predicted: [-0.67974073], target predicted: [-0.44756657]
should be -1.0, predicted: [-0.6771701], target predicted: [-0.36807889]
start searching new action
get new action:  0.24332547187805176
start getting new_q
get new_q:  0.06823515892028809
train start:
critic loss [0.074867651, 0.039235272]
critic loss [0.074838862, 0.040358096]
critic loss [0.070938811, 0.040420622]
critic loss [0.064117856, 0.039393958]
critic loss [0.055686336, 0.037563398]
actor loss -29.6122
train end 0.5109870433807373
should be 1.0, predicted: [ 1.15795064], target predicted: [ 0.74294668]
should be 1.0, predicted: [ 1.13680756], target predicted: [ 0.72057432]
should be 1.0, predicted: [ 0.35831469], target predicted: [ 0.43132025]
start searching new action
get new action:  0.2394411563873291
start getting new_q
get new_q:  0.06

critic loss [0.043364279, 0.032713406]
actor loss -40.0775
train end 0.5229756832122803
should be -1.0, predicted: [-0.74062592], target predicted: [-0.26368779]
should be 1.0, predicted: [ 1.05133915], target predicted: [ 0.77605259]
should be 1.0, predicted: [ 0.59120685], target predicted: [ 0.34536159]
start searching new action
get new action:  0.23178791999816895
start getting new_q
get new_q:  0.06583809852600098
train start:
critic loss [0.20457251, 0.0610498]
critic loss [0.18864554, 0.060184196]
critic loss [0.16381787, 0.058487356]
critic loss [0.13604888, 0.056649081]
critic loss [0.1101912, 0.055075273]
actor loss -86.5073
train end 0.5904924869537354
should be -1.0, predicted: [-0.3204492], target predicted: [ 0.1287725]
should be 1.0, predicted: [ 0.40115824], target predicted: [ 0.23057023]
should be -1.0, predicted: [-0.53620929], target predicted: [-0.24291277]
should be -1.0, predicted: [-0.83364618], target predicted: [-0.71051717]
Episode : 181 Replay Buffer 16900


get new action:  0.2549018859863281
start getting new_q
get new_q:  0.0802309513092041
train start:
critic loss [0.059471305, 0.056863774]
critic loss [0.057548001, 0.055954732]
critic loss [0.05435434, 0.053561524]
critic loss [0.050498337, 0.050183065]
critic loss [0.046565875, 0.046383478]
actor loss -40.536
train end 0.6206848621368408
should be 1.0, predicted: [ 1.01909971], target predicted: [ 0.8578319]
should be 1.0, predicted: [ 0.91398114], target predicted: [ 0.95702642]
should be 1.0, predicted: [ 1.10137081], target predicted: [ 0.78774267]
start searching new action
get new action:  0.24474072456359863
start getting new_q
get new_q:  0.0731513500213623
train start:
critic loss [0.079914957, 0.054289103]
critic loss [0.073339775, 0.051013265]
critic loss [0.066102684, 0.047858234]
critic loss [0.059533149, 0.045272704]
critic loss [0.054413106, 0.043427095]
actor loss -19.8256
train end 0.5366034507751465
should be 1.0, predicted: [ 1.02160656], target predicted: [ 0.74595

get new action:  0.24547982215881348
start getting new_q
get new_q:  0.06883430480957031
train start:
critic loss [0.084599517, 0.050959475]
critic loss [0.082375005, 0.050827891]
critic loss [0.075327531, 0.04875071]
critic loss [0.065532893, 0.045487043]
critic loss [0.055333741, 0.041943632]
actor loss -113.545
train end 0.8374052047729492
should be 1.0, predicted: [ 1.02829099], target predicted: [ 0.9456172]
should be -1.0, predicted: [-0.72056037], target predicted: [-0.61465544]
should be 1.0, predicted: [ 0.80840051], target predicted: [ 0.56377429]
start searching new action
get new action:  0.23517727851867676
start getting new_q
get new_q:  0.06123971939086914
train start:
critic loss [0.055932704, 0.038530249]
critic loss [0.053779703, 0.038075004]
critic loss [0.051060215, 0.038016312]
critic loss [0.048102695, 0.038115371]
critic loss [0.045122471, 0.038103525]
actor loss -109.145
train end 0.5427863597869873
should be 1.0, predicted: [ 0.73142785], target predicted: [ 0.

critic loss [0.053969067, 0.04487218]
critic loss [0.052158572, 0.044863954]
critic loss [0.049970202, 0.044435278]
critic loss [0.047535203, 0.043559775]
actor loss -60.7438
train end 0.5202634334564209
should be 1.0, predicted: [ 0.92837369], target predicted: [ 0.79690766]
should be 1.0, predicted: [ 1.10818803], target predicted: [ 0.6323967]
should be 1.0, predicted: [ 0.67384189], target predicted: [ 0.35061651]
start searching new action
get new action:  0.2302711009979248
start getting new_q
get new_q:  0.06247591972351074
train start:
critic loss [0.14189076, 0.056122232]
critic loss [0.13436143, 0.054649599]
critic loss [0.12156522, 0.052085467]
critic loss [0.10580136, 0.048892327]
critic loss [0.089086056, 0.045498502]
actor loss -70.2528
train end 0.5174286365509033
should be 1.0, predicted: [ 0.91972423], target predicted: [ 0.51576662]
should be 1.0, predicted: [ 1.07909524], target predicted: [ 0.91477072]
should be 1.0, predicted: [ 0.49662855], target predicted: [ 0.0

get new action:  0.23773670196533203
start getting new_q
get new_q:  0.06484794616699219
train start:
critic loss [0.088280879, 0.0454375]
critic loss [0.086605042, 0.046706069]
critic loss [0.082711414, 0.047385857]
critic loss [0.076714218, 0.047278494]
critic loss [0.069455966, 0.046392731]
actor loss -56.7015
train end 0.7956931591033936
should be 1.0, predicted: [ 1.04489529], target predicted: [ 0.42866212]
should be -1.0, predicted: [-0.09936155], target predicted: [-0.08673988]
should be 1.0, predicted: [ 1.31467688], target predicted: [ 0.39501539]
start searching new action
get new action:  0.2502939701080322
start getting new_q
get new_q:  0.06254744529724121
train start:
critic loss [0.10163336, 0.046832517]
critic loss [0.094767027, 0.04494112]
critic loss [0.082888298, 0.042122744]
critic loss [0.069144778, 0.039207563]
critic loss [0.056459926, 0.036840577]
actor loss -38.3988
train end 0.5571556091308594
should be 1.0, predicted: [ 1.24182951], target predicted: [ 0.881

get new action:  0.2447800636291504
start getting new_q
get new_q:  0.06714129447937012
train start:
critic loss [0.084918134, 0.080064207]
critic loss [0.077000216, 0.073714033]
critic loss [0.067808248, 0.066002652]
critic loss [0.059374891, 0.058324903]
critic loss [0.052966483, 0.051653102]
actor loss -53.5068
train end 0.5367531776428223
should be 1.0, predicted: [ 0.97985911], target predicted: [ 1.04684114]
should be 1.0, predicted: [ 1.06675982], target predicted: [ 0.8239767]
should be 1.0, predicted: [ 0.92818898], target predicted: [ 0.41865551]
start searching new action
get new action:  0.2535514831542969
start getting new_q
get new_q:  0.08162903785705566
train start:
critic loss [0.064759865, 0.052862559]
critic loss [0.061936721, 0.048091859]
critic loss [0.059102833, 0.044395886]
critic loss [0.055830192, 0.041441478]
critic loss [0.051873218, 0.038906947]
actor loss -125.521
train end 0.6892805099487305
should be 1.0, predicted: [ 0.64656723], target predicted: [ 0.43

critic loss [0.036173262, 0.035379715]
actor loss -91.1244
train end 0.5584096908569336
should be 1.0, predicted: [ 0.86105031], target predicted: [ 0.68144804]
start searching new action
get new action:  0.24269843101501465
start getting new_q
get new_q:  0.06937980651855469
train start:
critic loss [0.051369742, 0.046007063]
critic loss [0.049539596, 0.044774298]
critic loss [0.046891987, 0.043090031]
critic loss [0.0438921, 0.041153207]
critic loss [0.040943682, 0.039139524]
actor loss -117.636
train end 0.8704907894134521
should be 1.0, predicted: [ 0.65998292], target predicted: [ 0.33539066]
should be 1.0, predicted: [ 0.89462876], target predicted: [ 0.90927446]
Episode : 191 Replay Buffer 18014
TOTAL REWARD @ 191-th Episode  : Reward 1
Total Step: 124
start searching new action
get new action:  0.27227044105529785
start getting new_q
get new_q:  0.08084607124328613
train start:
critic loss [0.04286316, 0.042099744]
critic loss [0.041714393, 0.040930226]
critic loss [0.040297389

get new action:  0.24979400634765625
start getting new_q
get new_q:  0.06675982475280762
train start:
critic loss [0.068070836, 0.035886839]
critic loss [0.065720692, 0.034300312]
critic loss [0.061954215, 0.032948002]
critic loss [0.057144109, 0.031797662]
critic loss [0.051818602, 0.030810609]
actor loss -122.448
train end 0.5167908668518066
should be 1.0, predicted: [ 0.18656792], target predicted: [ 0.22525659]
should be -1.0, predicted: [-0.87153876], target predicted: [-0.61135346]
should be 1.0, predicted: [ 0.82940733], target predicted: [ 0.82532483]
start searching new action
get new action:  0.23741960525512695
start getting new_q
get new_q:  0.06354999542236328
train start:
critic loss [0.062864825, 0.031568289]
critic loss [0.061555382, 0.031435031]
critic loss [0.05818025, 0.030969549]
critic loss [0.053300165, 0.030213388]
critic loss [0.047873341, 0.029292915]
actor loss -128.475
train end 0.5734922885894775
should be -1.0, predicted: [-1.01824141], target predicted: [-

get new action:  0.22986340522766113
start getting new_q
get new_q:  0.06229233741760254
train start:
critic loss [0.04267846, 0.041964658]
critic loss [0.043824188, 0.042691775]
critic loss [0.044155963, 0.042752132]
critic loss [0.043522798, 0.042060412]
critic loss [0.041997686, 0.040686339]
actor loss -70.8627
train end 0.5323097705841064
should be -1.0, predicted: [-1.23641562], target predicted: [-0.96585447]
start searching new action
get new action:  0.23183894157409668
start getting new_q
get new_q:  0.11486244201660156
train start:
critic loss [0.20933817, 0.051518429]
critic loss [0.19067135, 0.047573049]
critic loss [0.15968364, 0.041940399]
critic loss [0.12420921, 0.036660224]
critic loss [0.09155038, 0.033390887]
actor loss 37.7635
train end 0.583122730255127
should be 1.0, predicted: [ 0.46452025], target predicted: [-0.05667718]
should be 1.0, predicted: [ 0.62828219], target predicted: [ 0.17453592]
should be 1.0, predicted: [ 0.28314096], target predicted: [ 0.091595

critic loss [0.12482833, 0.047412567]
critic loss [0.11417526, 0.045518599]
critic loss [0.10155883, 0.043404728]
critic loss [0.088259161, 0.041314777]
actor loss -38.7706
train end 0.5110955238342285
should be 1.0, predicted: [ 0.81152415], target predicted: [ 0.81379008]
should be 1.0, predicted: [ 0.1015323], target predicted: [-0.07106556]
should be 1.0, predicted: [ 0.71673965], target predicted: [ 0.64212489]
should be 1.0, predicted: [ 0.62896049], target predicted: [ 0.29054338]
should be 1.0, predicted: [ 0.47800285], target predicted: [ 0.36237758]
start searching new action
get new action:  0.2565293312072754
start getting new_q
get new_q:  0.07246804237365723
train start:
critic loss [0.06084048, 0.051413525]
critic loss [0.059447184, 0.051011264]
critic loss [0.057350427, 0.050206386]
critic loss [0.054786179, 0.049070857]
critic loss [0.051967062, 0.047667012]
actor loss -11.7819
train end 0.6788022518157959
should be 1.0, predicted: [ 0.65725726], target predicted: [ 0.

get new action:  0.22473359107971191
start getting new_q
get new_q:  0.06525492668151855
train start:
critic loss [0.030469954, 0.023493866]
critic loss [0.028408833, 0.023236983]
critic loss [0.026245743, 0.023008142]
critic loss [0.02450927, 0.022849027]
critic loss [0.023468096, 0.022761609]
actor loss -1.87707
train end 0.512690544128418
should be 1.0, predicted: [ 0.78226066], target predicted: [ 0.67291462]
should be 1.0, predicted: [ 0.90981674], target predicted: [ 0.75857419]
start searching new action
get new action:  0.23276782035827637
start getting new_q
get new_q:  0.06166696548461914
train start:
critic loss [0.031124417, 0.031124417]
critic loss [0.031500001, 0.031500001]
critic loss [0.031452134, 0.031452134]
critic loss [0.031000707, 0.031000707]
critic loss [0.030195169, 0.030195169]
actor loss 28.6062
train end 0.5356969833374023
start searching new action
get new action:  0.23343539237976074
start getting new_q
get new_q:  0.06018328666687012
train start:
critic lo

Episode : 200 Replay Buffer 18657
TOTAL REWARD @ 200-th Episode  : Reward 1
Total Step: 40
start searching new action
get new action:  0.23508834838867188
start getting new_q
get new_q:  0.06414103507995605
train start:
critic loss [0.065990761, 0.045552712]
critic loss [0.063514017, 0.044622026]
critic loss [0.059121124, 0.042882696]
critic loss [0.053728618, 0.040785041]
critic loss [0.048153415, 0.038698032]
actor loss -87.7546
train end 0.5713012218475342
should be -1.0, predicted: [-0.8027249], target predicted: [-0.55452615]
should be 1.0, predicted: [ 0.64000839], target predicted: [ 0.48018897]
should be 1.0, predicted: [ 0.78514796], target predicted: [ 0.97069985]
start searching new action
get new action:  0.22846245765686035
start getting new_q
get new_q:  0.06556487083435059
train start:
critic loss [0.058547549, 0.032107003]
critic loss [0.054856848, 0.032093607]
critic loss [0.049509048, 0.032207258]
critic loss [0.043846004, 0.032414474]
critic loss [0.038969923, 0.0325

critic loss [0.027170992, 0.024508432]
actor loss -19.3852
train end 0.9254148006439209
should be 1.0, predicted: [ 1.1453191], target predicted: [ 1.04326677]
should be 1.0, predicted: [ 0.85741878], target predicted: [ 0.43912181]
start searching new action
get new action:  0.24492430686950684
start getting new_q
get new_q:  0.06586170196533203
train start:
critic loss [0.053322218, 0.035617203]
critic loss [0.050920296, 0.035157517]
critic loss [0.047238886, 0.034363344]
critic loss [0.04301437, 0.033378959]
critic loss [0.03878291, 0.032306559]
actor loss -65.3219
train end 0.6375110149383545
should be -1.0, predicted: [-0.63743991], target predicted: [-0.48169154]
Episode : 202 Replay Buffer 18944
TOTAL REWARD @ 202-th Episode  : Reward -1
Total Step: 145
start searching new action
get new action:  0.24333572387695312
start getting new_q
get new_q:  0.06652569770812988
train start:
critic loss [0.079788677, 0.038020939]
critic loss [0.076587781, 0.037057493]
critic loss [0.0703207

critic loss [0.11166486, 0.048789393]
critic loss [0.095583566, 0.04709015]
critic loss [0.078542113, 0.045323148]
critic loss [0.063969977, 0.044027571]
actor loss 5.79658
train end 0.6992137432098389
should be 1.0, predicted: [ 0.69975644], target predicted: [ 0.20916905]
should be 1.0, predicted: [ 0.42861658], target predicted: [-0.20305958]
start searching new action
get new action:  0.27200746536254883
start getting new_q
get new_q:  0.06833314895629883
train start:
critic loss [0.085214511, 0.051092971]
critic loss [0.08444421, 0.051971171]
critic loss [0.08062011, 0.051473871]
critic loss [0.074064597, 0.049583219]
critic loss [0.065737747, 0.046644188]
actor loss 40.0609
train end 0.6141893863677979
should be 1.0, predicted: [ 0.59185761], target predicted: [ 0.5552181]
should be 1.0, predicted: [ 0.82650274], target predicted: [ 0.4710246]
should be 1.0, predicted: [ 0.94604582], target predicted: [ 0.19825651]
should be -1.0, predicted: [-0.2939283], target predicted: [-0.28

get new action:  0.2733023166656494
start getting new_q
get new_q:  0.06598496437072754
train start:
critic loss [0.037158221, 0.037158221]
critic loss [0.036555473, 0.036555473]
critic loss [0.035503484, 0.035503484]
critic loss [0.03407808, 0.03407808]
critic loss [0.032389503, 0.032389503]
actor loss -153.149
train end 0.7926597595214844
start searching new action
get new action:  0.2273099422454834
start getting new_q
get new_q:  0.07205629348754883
train start:
critic loss [0.063244581, 0.037461624]
critic loss [0.059735022, 0.035789154]
critic loss [0.053727545, 0.033737414]
critic loss [0.04682764, 0.031670205]
critic loss [0.040407788, 0.02986392]
actor loss 21.8965
train end 0.5821616649627686
should be 1.0, predicted: [ 1.06946635], target predicted: [ 0.8349725]
should be 1.0, predicted: [ 0.91560298], target predicted: [ 0.63062888]
should be -1.0, predicted: [-0.61235595], target predicted: [-0.81145006]
should be 1.0, predicted: [ 0.89738297], target predicted: [ 0.766476

get new action:  0.24358868598937988
start getting new_q
get new_q:  0.06586170196533203
train start:
critic loss [0.071516335, 0.042046532]
critic loss [0.067775398, 0.040910125]
critic loss [0.061688475, 0.039114263]
critic loss [0.054365475, 0.036941245]
critic loss [0.046917349, 0.034686979]
actor loss -47.492
train end 0.6414203643798828
should be 1.0, predicted: [ 0.7208159], target predicted: [ 0.47260547]
should be 1.0, predicted: [ 1.1149174], target predicted: [ 0.97257245]
should be 1.0, predicted: [ 0.72646862], target predicted: [ 0.52224672]
should be 1.0, predicted: [ 0.83165592], target predicted: [ 0.63303083]
start searching new action
get new action:  0.2369213104248047
start getting new_q
get new_q:  0.06224989891052246
train start:
critic loss [0.047756203, 0.033548936]
critic loss [0.04507716, 0.033096947]
critic loss [0.041699417, 0.032733276]
critic loss [0.038412161, 0.032589454]
critic loss [0.035781845, 0.032651328]
actor loss -126.255
train end 0.63312649726

critic loss [0.068177141, 0.044452216]
critic loss [0.069725595, 0.046197813]
critic loss [0.067903191, 0.046787865]
critic loss [0.063021004, 0.045968726]
actor loss -213.516
train end 0.6340229511260986
should be -1.0, predicted: [-0.93510276], target predicted: [-0.72007632]
should be 1.0, predicted: [ 1.03292966], target predicted: [ 0.9389267]
should be 1.0, predicted: [ 0.97181422], target predicted: [ 0.70161098]
should be 1.0, predicted: [ 1.06865168], target predicted: [ 1.05914736]
should be 1.0, predicted: [ 0.70830101], target predicted: [ 0.40120441]
should be -1.0, predicted: [-0.86757952], target predicted: [-0.66713959]
start searching new action
get new action:  0.23450875282287598
start getting new_q
get new_q:  0.06624627113342285
train start:
critic loss [0.05822584, 0.044152584]
critic loss [0.054355554, 0.042681079]
critic loss [0.048615344, 0.040454701]
critic loss [0.042775624, 0.038127322]
critic loss [0.038162239, 0.036191359]
actor loss -102.459
train end 0.5

critic loss [0.058391735, 0.034709685]
critic loss [0.053028271, 0.033148222]
critic loss [0.046500444, 0.0313899]
critic loss [0.039959293, 0.02969972]
actor loss -39.6931
train end 0.5425269603729248
should be 1.0, predicted: [ 1.07344019], target predicted: [ 1.0116744]
should be 1.0, predicted: [ 0.62558556], target predicted: [ 0.30851284]
start searching new action
get new action:  0.23210978507995605
start getting new_q
get new_q:  0.06505990028381348
train start:
critic loss [0.035128899, 0.031525396]
critic loss [0.03434588, 0.031022167]
critic loss [0.033407982, 0.030566417]
critic loss [0.032363303, 0.030120853]
critic loss [0.031261496, 0.029650532]
actor loss -82.1625
train end 0.5484681129455566
should be 1.0, predicted: [ 0.87518257], target predicted: [ 0.76300579]
start searching new action
get new action:  0.22835612297058105
start getting new_q
get new_q:  0.063568115234375
train start:
critic loss [0.031323329, 0.03121279]
critic loss [0.031395588, 0.031219956]
crit

get new action:  0.24250435829162598
start getting new_q
get new_q:  0.06521940231323242
train start:
critic loss [0.039722204, 0.039593108]
critic loss [0.038320314, 0.03821107]
critic loss [0.036754601, 0.036683634]
critic loss [0.03514757, 0.035116293]
critic loss [0.033586215, 0.033580918]
actor loss -73.1112
train end 0.5261962413787842
should be 1.0, predicted: [ 1.05487645], target predicted: [ 0.87764102]
Episode : 212 Replay Buffer 20058
TOTAL REWARD @ 212-th Episode  : Reward -1
Total Step: 123
start searching new action
get new action:  0.2309398651123047
start getting new_q
get new_q:  0.06557035446166992
train start:
critic loss [0.086663961, 0.038441725]
critic loss [0.08405342, 0.037720963]
critic loss [0.079225279, 0.036787428]
critic loss [0.072913356, 0.035707336]
critic loss [0.065752298, 0.034514111]
actor loss -114.343
train end 0.6459841728210449
should be 1.0, predicted: [ 0.98644745], target predicted: [ 0.74351323]
should be 1.0, predicted: [ 0.5991959], target

critic loss [0.049114615, 0.031735998]
critic loss [0.046162836, 0.030901304]
critic loss [0.042322278, 0.029780827]
critic loss [0.038037997, 0.028492883]
actor loss -76.6717
train end 0.5056107044219971
should be 1.0, predicted: [ 0.89482474], target predicted: [ 0.7853905]
should be -1.0, predicted: [-0.86872613], target predicted: [-0.59504384]
should be 1.0, predicted: [ 0.66462934], target predicted: [ 0.48396957]
start searching new action
get new action:  0.22430658340454102
start getting new_q
get new_q:  0.06765103340148926
train start:
critic loss [0.041503694, 0.031472348]
critic loss [0.040641684, 0.031474542]
critic loss [0.038950548, 0.031239033]
critic loss [0.036713548, 0.030789364]
critic loss [0.03421247, 0.030136995]
actor loss -30.5023
train end 0.5017297267913818
should be 1.0, predicted: [ 0.9819541], target predicted: [ 0.7611407]
should be 1.0, predicted: [ 0.70087451], target predicted: [ 0.7351703]
Episode : 214 Replay Buffer 20305
TOTAL REWARD @ 214-th Episo

critic loss [0.048715871, 0.034132309]
critic loss [0.046142086, 0.033303455]
critic loss [0.042740528, 0.032243468]
critic loss [0.038951483, 0.031019406]
actor loss -128.973
train end 0.5306932926177979
should be 1.0, predicted: [ 0.74209696], target predicted: [ 0.44200656]
should be 1.0, predicted: [ 0.93167299], target predicted: [ 0.66827404]
should be 1.0, predicted: [ 0.75650936], target predicted: [ 0.3450025]
start searching new action
get new action:  0.22992944717407227
start getting new_q
get new_q:  0.06527304649353027
train start:
critic loss [0.045006145, 0.038558975]
critic loss [0.044118799, 0.038200811]
critic loss [0.04228396, 0.037333757]
critic loss [0.03986457, 0.036099911]
critic loss [0.037216444, 0.034651514]
actor loss -86.5848
train end 0.5511181354522705
should be 1.0, predicted: [ 0.7013492], target predicted: [ 0.54923904]
should be 1.0, predicted: [ 0.8633641], target predicted: [ 0.74965578]
Episode : 216 Replay Buffer 20462
TOTAL REWARD @ 216-th Episod

critic loss [0.04760249, 0.037621431]
critic loss [0.044536155, 0.036585771]
critic loss [0.041106559, 0.035304822]
critic loss [0.037734225, 0.033891123]
actor loss -0.740756
train end 0.51059889793396
should be 1.0, predicted: [ 0.79086262], target predicted: [ 0.38367164]
start searching new action
get new action:  0.2389063835144043
start getting new_q
get new_q:  0.06255698204040527
train start:
critic loss [0.065116078, 0.038312443]
critic loss [0.06185906, 0.037555315]
critic loss [0.056734689, 0.03645362]
critic loss [0.05072191, 0.035137929]
critic loss [0.044867426, 0.033717066]
actor loss -34.1435
train end 0.5080220699310303
should be 1.0, predicted: [ 0.95584983], target predicted: [ 1.08820462]
should be 1.0, predicted: [ 0.573452], target predicted: [ 0.20480049]
should be 1.0, predicted: [ 0.73553175], target predicted: [ 0.8413139]
start searching new action
get new action:  0.23323798179626465
start getting new_q
get new_q:  0.060373783111572266
train start:
critic lo

get new action:  0.23946261405944824
start getting new_q
get new_q:  0.06466555595397949
train start:
critic loss [0.10810529, 0.037939101]
critic loss [0.099262752, 0.037256181]
critic loss [0.086935394, 0.036123849]
critic loss [0.074117459, 0.034888871]
critic loss [0.062644988, 0.033708617]
actor loss -178.044
train end 0.5454940795898438
should be -1.0, predicted: [-0.36029944], target predicted: [-0.34700388]
should be 1.0, predicted: [ 0.9223305], target predicted: [ 0.55854994]
should be 1.0, predicted: [ 0.65040624], target predicted: [ 0.03399901]
should be 1.0, predicted: [ 0.67356485], target predicted: [ 0.35645521]
start searching new action
get new action:  0.23589301109313965
start getting new_q
get new_q:  0.06769585609436035
train start:
critic loss [0.045384474, 0.040892683]
critic loss [0.044153512, 0.040310308]
critic loss [0.042241037, 0.039046776]
critic loss [0.039916735, 0.03734374]
critic loss [0.037439339, 0.035458736]
actor loss -172.002
train end 0.52609348

critic loss [0.023208139, 0.022179525]
actor loss -112.189
train end 0.5075297355651855
should be 1.0, predicted: [ 0.84727776], target predicted: [ 0.68806309]
should be 1.0, predicted: [ 1.00859129], target predicted: [ 0.72727805]
start searching new action
get new action:  0.23905706405639648
start getting new_q
get new_q:  0.06897377967834473
train start:
critic loss [0.046865407, 0.028352402]
critic loss [0.044153668, 0.027929908]
critic loss [0.039784715, 0.02716044]
critic loss [0.034958199, 0.026249342]
critic loss [0.030703384, 0.025406037]
actor loss -31.1132
train end 0.6110997200012207
should be 1.0, predicted: [ 0.80493873], target predicted: [ 0.40777758]
should be 1.0, predicted: [ 1.14826369], target predicted: [ 0.8639304]
Episode : 221 Replay Buffer 20853
TOTAL REWARD @ 221-th Episode  : Reward -1
Total Step: 29
start searching new action
get new action:  0.24649500846862793
start getting new_q
get new_q:  0.06725740432739258
train start:
critic loss [0.040638682, 0.

should be 1.0, predicted: [ 0.90891755], target predicted: [ 0.95971507]
should be 1.0, predicted: [ 1.14702308], target predicted: [ 0.64798957]
start searching new action
get new action:  0.25020837783813477
start getting new_q
get new_q:  0.08175373077392578
train start:
critic loss [0.052743003, 0.037218578]
critic loss [0.049542367, 0.035810899]
critic loss [0.045303304, 0.034293767]
critic loss [0.040685073, 0.032788403]
critic loss [0.036341697, 0.031375781]
actor loss -234.864
train end 0.970545768737793
should be 1.0, predicted: [ 0.94354129], target predicted: [ 0.19095518]
should be 1.0, predicted: [ 1.08440268], target predicted: [ 0.53308034]
should be -1.0, predicted: [-0.7076757], target predicted: [-0.76424086]
should be 1.0, predicted: [ 1.0474292], target predicted: [ 0.44649428]
should be 1.0, predicted: [ 1.04060256], target predicted: [ 0.68290597]
start searching new action
get new action:  0.297274112701416
start getting new_q
get new_q:  0.07128548622131348
trai

critic loss [0.033785686, 0.033785686]
critic loss [0.032464474, 0.032464474]
critic loss [0.031070255, 0.031070255]
critic loss [0.029629625, 0.029629625]
actor loss -234.19
train end 0.5148169994354248
start searching new action
get new action:  0.2283482551574707
start getting new_q
get new_q:  0.07025766372680664
train start:
critic loss [0.045644592, 0.03773351]
critic loss [0.045857295, 0.037726905]
critic loss [0.044722952, 0.037280854]
critic loss [0.042511683, 0.036454231]
critic loss [0.039657287, 0.035333842]
actor loss -23.9196
train end 0.5402798652648926
should be 1.0, predicted: [ 0.60458034], target predicted: [ 1.00852334]
should be 1.0, predicted: [ 0.93678993], target predicted: [ 0.66722065]
start searching new action
get new action:  0.23421359062194824
start getting new_q
get new_q:  0.06478404998779297
train start:
critic loss [0.10561788, 0.039364662]
critic loss [0.099521026, 0.03894335]
critic loss [0.089792185, 0.038044661]
critic loss [0.078265622, 0.0369196

get new action:  0.24280643463134766
start getting new_q
get new_q:  0.07410264015197754
train start:
critic loss [0.033628941, 0.026965771]
critic loss [0.033169024, 0.026629673]
critic loss [0.03209047, 0.026115123]
critic loss [0.030567992, 0.02548055]
critic loss [0.028794717, 0.024770558]
actor loss -216.752
train end 0.5765933990478516
should be -1.0, predicted: [-0.72957414], target predicted: [-0.40733311]
should be -1.0, predicted: [-1.03310335], target predicted: [-0.90203625]
start searching new action
get new action:  0.2553832530975342
start getting new_q
get new_q:  0.06328415870666504
train start:
critic loss [0.063836537, 0.040056288]
critic loss [0.061604276, 0.039090179]
critic loss [0.057884272, 0.037887]
critic loss [0.053289995, 0.036602091]
critic loss [0.048432916, 0.035359055]
actor loss -226.802
train end 0.6730818748474121
should be -1.0, predicted: [-0.76625085], target predicted: [-0.55417687]
should be 1.0, predicted: [ 0.4222644], target predicted: [ 0.256

critic loss [0.056317851, 0.047136888]
critic loss [0.052086979, 0.044925544]
critic loss [0.047619294, 0.042656124]
critic loss [0.043562263, 0.04056083]
actor loss -101.095
train end 0.5789108276367188
should be -1.0, predicted: [-0.76700056], target predicted: [-0.63855344]
should be 1.0, predicted: [ 1.11880291], target predicted: [ 1.01267588]
should be 1.0, predicted: [ 1.00691414], target predicted: [ 0.61489654]
start searching new action
get new action:  0.24237918853759766
start getting new_q
get new_q:  0.06647849082946777
train start:
critic loss [0.085299797, 0.037831817]
critic loss [0.082990631, 0.037364237]
critic loss [0.076607823, 0.036104526]
critic loss [0.067604125, 0.034219772]
critic loss [0.05736611, 0.032014564]
actor loss -74.7256
train end 0.5698575973510742
should be 1.0, predicted: [ 0.98717022], target predicted: [ 0.91794294]
should be 1.0, predicted: [ 0.37725997], target predicted: [-0.16421716]
start searching new action
get new action:  0.241914272308

Episode : 230 Replay Buffer 21794
TOTAL REWARD @ 230-th Episode  : Reward -1
Total Step: 133
start searching new action
get new action:  0.2479703426361084
start getting new_q
get new_q:  0.06427764892578125
train start:
critic loss [0.10898572, 0.040976681]
critic loss [0.10233182, 0.039822996]
critic loss [0.089959934, 0.03818313]
critic loss [0.075243205, 0.036574263]
critic loss [0.061433185, 0.035421602]
actor loss -240.034
train end 0.5237658023834229
should be -1.0, predicted: [-0.76262796], target predicted: [-0.92222977]
should be 1.0, predicted: [ 0.8429842], target predicted: [ 0.58568102]
should be 1.0, predicted: [ 1.12090588], target predicted: [ 0.43359014]
should be 1.0, predicted: [ 0.5275926], target predicted: [ 0.36099482]
should be 1.0, predicted: [ 1.10412955], target predicted: [ 0.55642211]
start searching new action
get new action:  0.23349356651306152
start getting new_q
get new_q:  0.06154680252075195
train start:
critic loss [0.050383743, 0.03823898]
critic 

get new action:  0.251816987991333
start getting new_q
get new_q:  0.06889557838439941
train start:
critic loss [0.04654124, 0.039748147]
critic loss [0.045476634, 0.039674684]
critic loss [0.043963522, 0.039408937]
critic loss [0.042184129, 0.038922127]
critic loss [0.040262535, 0.038198866]
actor loss -220.977
train end 0.6182332038879395
should be -1.0, predicted: [-0.82947576], target predicted: [-0.91971838]
should be 1.0, predicted: [ 1.18340945], target predicted: [ 0.90129739]
should be 1.0, predicted: [ 0.85270464], target predicted: [ 0.74269038]
should be 1.0, predicted: [ 1.00627589], target predicted: [ 0.98739326]
Episode : 232 Replay Buffer 21937
TOTAL REWARD @ 232-th Episode  : Reward -1
Total Step: 137
start searching new action
get new action:  0.251420259475708
start getting new_q
get new_q:  0.0684652328491211
train start:
critic loss [0.043199152, 0.03535619]
critic loss [0.04236684, 0.035124451]
critic loss [0.040650144, 0.034624293]
critic loss [0.038409024, 0.03

should be 1.0, predicted: [ 0.9076733], target predicted: [ 0.76055115]
start searching new action
get new action:  0.24461793899536133
start getting new_q
get new_q:  0.06966638565063477
train start:
critic loss [0.074435703, 0.034337115]
critic loss [0.071955062, 0.034712352]
critic loss [0.066961356, 0.034663737]
critic loss [0.060139269, 0.034236282]
critic loss [0.052815549, 0.033551235]
actor loss -155.608
train end 0.5828475952148438
should be 1.0, predicted: [ 1.28795516], target predicted: [ 1.02066398]
should be -1.0, predicted: [-0.65542722], target predicted: [-0.56474292]
should be 1.0, predicted: [ 1.18147945], target predicted: [ 0.91176748]
should be 1.0, predicted: [ 1.01699138], target predicted: [ 0.8061133]
should be 1.0, predicted: [ 0.66756511], target predicted: [ 0.09635771]
should be 1.0, predicted: [ 1.14365423], target predicted: [ 1.00903594]
should be 1.0, predicted: [ 0.89830768], target predicted: [ 0.66595358]
start searching new action
get new action:  

get new action:  0.25418925285339355
start getting new_q
get new_q:  0.06847548484802246
train start:
critic loss [0.041773532, 0.033232257]
critic loss [0.039835215, 0.032310657]
critic loss [0.037022032, 0.031071153]
critic loss [0.033881016, 0.02971245]
critic loss [0.031003583, 0.028414067]
actor loss -171.798
train end 0.5355019569396973
should be -1.0, predicted: [-0.69144922], target predicted: [-0.63533741]
should be 1.0, predicted: [ 1.00739276], target predicted: [ 0.8724193]
start searching new action
get new action:  0.23772764205932617
start getting new_q
get new_q:  0.07099199295043945
train start:
critic loss [0.047425017, 0.031950314]
critic loss [0.045807984, 0.0310489]
critic loss [0.042579174, 0.029919401]
critic loss [0.038673364, 0.028768599]
critic loss [0.034786247, 0.027777499]
actor loss -77.3525
train end 0.4956941604614258
should be 1.0, predicted: [ 0.65845186], target predicted: [ 0.50340027]
should be 1.0, predicted: [ 1.00784683], target predicted: [ 0.89

actor loss -174.327
train end 1.1459901332855225
should be 1.0, predicted: [ 0.72544497], target predicted: [ 0.38939807]
should be 1.0, predicted: [ 1.09080005], target predicted: [ 1.10313952]
should be 1.0, predicted: [ 0.98515177], target predicted: [ 0.75272882]
start searching new action
get new action:  0.2668733596801758
start getting new_q
get new_q:  0.06654191017150879
train start:
critic loss [0.057226181, 0.031865396]
critic loss [0.05412446, 0.031428374]
critic loss [0.049150161, 0.030584818]
critic loss [0.043241791, 0.029491946]
critic loss [0.037506785, 0.028450508]
actor loss -181.494
train end 0.7394280433654785
should be 1.0, predicted: [ 0.94626307], target predicted: [ 0.74716067]
should be 1.0, predicted: [ 0.82826281], target predicted: [ 0.54590255]
start searching new action
get new action:  0.22530388832092285
start getting new_q
get new_q:  0.06844091415405273
train start:
critic loss [0.08957018, 0.040832467]
critic loss [0.083058625, 0.039469924]
critic lo

critic loss [0.029997477, 0.029989932]
actor loss -186.935
train end 0.5127177238464355
should be 1.0, predicted: [ 0.72162449], target predicted: [ 0.98558164]
Episode : 239 Replay Buffer 22959
TOTAL REWARD @ 239-th Episode  : Reward -1
Total Step: 109
start searching new action
get new action:  0.24492526054382324
start getting new_q
get new_q:  0.06398534774780273
train start:
critic loss [0.099822573, 0.039513633]
critic loss [0.094351299, 0.038936008]
critic loss [0.084778063, 0.038258795]
critic loss [0.073420972, 0.037675772]
critic loss [0.062498067, 0.037268814]
actor loss -180.548
train end 0.5193591117858887
should be 1.0, predicted: [ 0.86545914], target predicted: [ 0.78078943]
should be 1.0, predicted: [ 0.75426078], target predicted: [ 0.99009424]
should be 1.0, predicted: [ 0.10462825], target predicted: [ 0.19622932]
should be 1.0, predicted: [ 0.99340874], target predicted: [ 0.73505098]
should be 1.0, predicted: [ 1.00315118], target predicted: [ 0.95719069]
should b

get new action:  0.24145054817199707
start getting new_q
get new_q:  0.06444025039672852
train start:
critic loss [0.027667321, 0.027667321]
critic loss [0.027444314, 0.027444314]
critic loss [0.027029183, 0.027029183]
critic loss [0.026416529, 0.026416529]
critic loss [0.02564268, 0.02564268]
actor loss -189.795
train end 0.5223960876464844
start searching new action
get new action:  0.23456096649169922
start getting new_q
get new_q:  0.07682156562805176
train start:
critic loss [0.058540888, 0.033111162]
critic loss [0.057042111, 0.032831371]
critic loss [0.05376938, 0.032240301]
critic loss [0.049377427, 0.031410992]
critic loss [0.044501074, 0.030426225]
actor loss -50.3146
train end 0.749269962310791
should be 1.0, predicted: [ 0.98954362], target predicted: [ 0.84737974]
should be 1.0, predicted: [ 0.31141788], target predicted: [ 0.43158916]
start searching new action
get new action:  0.28327322006225586
start getting new_q
get new_q:  0.06897544860839844
train start:
critic los

get new action:  0.2359788417816162
start getting new_q
get new_q:  0.05986905097961426
train start:
critic loss [0.063249223, 0.030222485]
critic loss [0.058240566, 0.029905431]
critic loss [0.051550977, 0.02984692]
critic loss [0.045004025, 0.03018637]
critic loss [0.039907221, 0.030898392]
actor loss -227.336
train end 0.5272901058197021
should be -1.0, predicted: [-1.07315993], target predicted: [-0.85840887]
should be 1.0, predicted: [ 0.74391603], target predicted: [ 0.62290829]
should be 1.0, predicted: [ 0.41743439], target predicted: [ 0.28166884]
should be 1.0, predicted: [ 0.9422887], target predicted: [ 1.05468512]
should be 1.0, predicted: [ 0.50525188], target predicted: [ 0.43779373]
start searching new action
get new action:  0.23515534400939941
start getting new_q
get new_q:  0.061292409896850586
train start:
critic loss [0.035930999, 0.035001308]
critic loss [0.037280254, 0.03628426]
critic loss [0.037382133, 0.036448073]
critic loss [0.036292411, 0.035548355]
critic 

critic loss [0.061798163, 0.033182085]
critic loss [0.056038961, 0.031849209]
critic loss [0.049028553, 0.030257829]
critic loss [0.041815378, 0.02864705]
actor loss -1.91475
train end 0.4944303035736084
should be 1.0, predicted: [ 0.50801247], target predicted: [ 0.39444321]
should be 1.0, predicted: [ 0.84674394], target predicted: [ 0.89458108]
start searching new action
get new action:  0.23204803466796875
start getting new_q
get new_q:  0.0682988166809082
train start:
critic loss [0.061666727, 0.035855077]
critic loss [0.057833202, 0.03574723]
critic loss [0.052746303, 0.035577588]
critic loss [0.04735861, 0.035398118]
critic loss [0.042488784, 0.035204519]
actor loss -35.7251
train end 0.5014832019805908
should be 1.0, predicted: [ 0.77235663], target predicted: [ 0.53569609]
should be 1.0, predicted: [ 1.04982054], target predicted: [ 0.62162471]
should be 1.0, predicted: [ 0.95141149], target predicted: [ 0.74917477]
should be 1.0, predicted: [ 0.82734776], target predicted: [ 

get new action:  0.24194812774658203
start getting new_q
get new_q:  0.07626032829284668
train start:
critic loss [0.03816627, 0.035271451]
critic loss [0.037522197, 0.035274256]
critic loss [0.036751129, 0.035123467]
critic loss [0.03582485, 0.034763895]
critic loss [0.034809105, 0.034191217]
actor loss -90.3053
train end 0.6304874420166016
should be 1.0, predicted: [ 0.93524045], target predicted: [ 0.7588588]
should be 1.0, predicted: [ 0.89366519], target predicted: [ 0.92729062]
Episode : 246 Replay Buffer 23775
TOTAL REWARD @ 246-th Episode  : Reward -1
Total Step: 132
start searching new action
get new action:  0.26356053352355957
start getting new_q
get new_q:  0.0703737735748291
train start:
critic loss [0.054745279, 0.046025425]
critic loss [0.052078448, 0.045082405]
critic loss [0.048937015, 0.043618124]
critic loss [0.045665517, 0.041909181]
critic loss [0.042550933, 0.040129669]
actor loss -241.416
train end 0.783334493637085
should be -1.0, predicted: [-0.83296597], targe

critic loss [0.048638083, 0.035916448]
actor loss -49.6337
train end 0.520028829574585
should be 1.0, predicted: [ 1.30611682], target predicted: [ 1.30330515]
should be 1.0, predicted: [ 0.62176436], target predicted: [ 0.2459432]
start searching new action
get new action:  0.23981761932373047
start getting new_q
get new_q:  0.0673823356628418
train start:
critic loss [0.034334153, 0.034188405]
critic loss [0.033554979, 0.033365324]
critic loss [0.032567199, 0.032338258]
critic loss [0.031466313, 0.031210063]
critic loss [0.030306468, 0.030041199]
actor loss -225.915
train end 0.5027835369110107
should be 1.0, predicted: [ 0.87334961], target predicted: [ 0.92614698]
start searching new action
get new action:  0.23716425895690918
start getting new_q
get new_q:  0.06723976135253906
train start:
critic loss [0.043613143, 0.028475698]
critic loss [0.041691922, 0.028155345]
critic loss [0.038113642, 0.027133178]
critic loss [0.033717122, 0.025720948]
critic loss [0.029432463, 0.024287213]

critic loss [0.02679529, 0.023316661]
actor loss -95.4621
train end 0.5472190380096436
should be 1.0, predicted: [ 1.04952347], target predicted: [ 0.86938202]
should be 1.0, predicted: [ 0.70982879], target predicted: [ 0.78648216]
should be 1.0, predicted: [ 0.90863895], target predicted: [ 0.70826924]
should be 1.0, predicted: [ 0.9714542], target predicted: [ 1.01593864]
start searching new action
get new action:  0.23284387588500977
start getting new_q
get new_q:  0.06229543685913086
train start:
critic loss [0.058159303, 0.037187509]
critic loss [0.054700851, 0.036138181]
critic loss [0.049532324, 0.034688033]
critic loss [0.043920252, 0.033228066]
critic loss [0.038820915, 0.031966828]
actor loss -30.5147
train end 0.5137331485748291
should be 1.0, predicted: [ 0.89533603], target predicted: [ 0.62345546]
should be 1.0, predicted: [ 1.04670823], target predicted: [ 0.83216411]
should be 1.0, predicted: [ 0.92774904], target predicted: [ 0.8249225]
should be 1.0, predicted: [ 1.1

get new action:  0.22102785110473633
start getting new_q
get new_q:  0.0699460506439209
train start:
critic loss [0.045677193, 0.033075497]
critic loss [0.043235533, 0.032574188]
critic loss [0.038557425, 0.03150674]
critic loss [0.033289969, 0.030193241]
critic loss [0.029428815, 0.028978076]
actor loss -39.9863
train end 0.5035877227783203
should be 1.0, predicted: [ 1.09757817], target predicted: [ 1.15885496]
should be 1.0, predicted: [ 0.88392174], target predicted: [ 0.72083086]
start searching new action
get new action:  0.24444341659545898
start getting new_q
get new_q:  0.06464028358459473
train start:
critic loss [0.06133813, 0.033040054]
critic loss [0.060280629, 0.032595851]
critic loss [0.057090856, 0.03192088]
critic loss [0.052479804, 0.03104353]
critic loss [0.047253028, 0.030031428]
actor loss -61.8238
train end 0.4986701011657715
should be 1.0, predicted: [ 0.94605744], target predicted: [ 0.89960825]
should be 1.0, predicted: [ 0.94788259], target predicted: [ 0.6672

should be 1.0, predicted: [ 0.85304767], target predicted: [ 0.77477908]
should be 1.0, predicted: [ 0.993357], target predicted: [ 0.82215434]
start searching new action
get new action:  0.23319458961486816
start getting new_q
get new_q:  0.06125164031982422
train start:
critic loss [0.087059282, 0.051044349]
critic loss [0.081936643, 0.051670909]
critic loss [0.075154543, 0.05185983]
critic loss [0.067850672, 0.051681601]
critic loss [0.060824387, 0.051113136]
actor loss -186.201
train end 0.5140454769134521
should be 1.0, predicted: [ 0.49729279], target predicted: [ 0.11682954]
should be 1.0, predicted: [ 0.9682824], target predicted: [ 0.42975163]
start searching new action
get new action:  0.22844719886779785
start getting new_q
get new_q:  0.0640254020690918
train start:
critic loss [0.094345436, 0.039675716]
critic loss [0.092557505, 0.04051327]
critic loss [0.087492995, 0.040518895]
critic loss [0.080105767, 0.039794322]
critic loss [0.071488366, 0.038476542]
actor loss -165.3

critic loss [0.036450174, 0.033368811]
actor loss -62.1005
train end 0.6032693386077881
should be 1.0, predicted: [ 0.98214632], target predicted: [ 0.86931407]
should be 1.0, predicted: [ 0.7813701], target predicted: [ 0.85703754]
should be 1.0, predicted: [ 0.99981827], target predicted: [ 0.86398637]
start searching new action
get new action:  0.2551741600036621
start getting new_q
get new_q:  0.081268310546875
train start:
critic loss [0.061116178, 0.039863341]
critic loss [0.057962473, 0.03925002]
critic loss [0.053234212, 0.038336463]
critic loss [0.04787983, 0.03724312]
critic loss [0.042723689, 0.036065027]
actor loss -287.853
train end 0.6001787185668945
should be 1.0, predicted: [ 0.60589784], target predicted: [ 0.46061736]
start searching new action
get new action:  0.22730183601379395
start getting new_q
get new_q:  0.07157325744628906
train start:
critic loss [0.031681031, 0.030849358]
critic loss [0.031613834, 0.030883349]
critic loss [0.031164363, 0.030600177]
critic l

get new action:  0.22823476791381836
start getting new_q
get new_q:  0.06084918975830078
train start:
critic loss [0.040946491, 0.0324113]
critic loss [0.037712522, 0.03030107]
critic loss [0.034152325, 0.028194748]
critic loss [0.030851403, 0.026428496]
critic loss [0.028180588, 0.025179971]
actor loss -255.375
train end 0.5312299728393555
should be 1.0, predicted: [ 0.99920028], target predicted: [ 0.87710673]
should be 1.0, predicted: [ 0.93158686], target predicted: [ 0.58689505]
start searching new action
get new action:  0.2393174171447754
start getting new_q
get new_q:  0.06471538543701172
train start:
critic loss [0.057037182, 0.03774035]
critic loss [0.055649623, 0.037549064]
critic loss [0.052612364, 0.037219912]
critic loss [0.048587542, 0.036777858]
critic loss [0.044335768, 0.036276292]
actor loss -416.794
train end 0.5201156139373779
should be 1.0, predicted: [ 0.8957687], target predicted: [ 0.50995058]
should be 1.0, predicted: [ 0.77863801], target predicted: [ 0.34226

get new action:  0.24318718910217285
start getting new_q
get new_q:  0.06702303886413574
train start:
critic loss [0.070458256, 0.036237963]
critic loss [0.069586761, 0.036551457]
critic loss [0.067418724, 0.03674335]
critic loss [0.064139508, 0.036683246]
critic loss [0.060151182, 0.036321912]
actor loss -190.487
train end 0.5395188331604004
should be 1.0, predicted: [ 0.2437762], target predicted: [ 0.07300361]
start searching new action
get new action:  0.22702693939208984
start getting new_q
get new_q:  0.06398582458496094
train start:
critic loss [0.098047152, 0.043384593]
critic loss [0.09351588, 0.042642515]
critic loss [0.084927022, 0.041018829]
critic loss [0.074008733, 0.038874827]
critic loss [0.062652975, 0.036600545]
actor loss -125.976
train end 0.6023023128509521
should be 1.0, predicted: [ 0.42734477], target predicted: [-0.20340177]
should be 1.0, predicted: [ 0.9270165], target predicted: [ 0.98520976]
start searching new action
get new action:  0.23778533935546875
st

critic loss [0.03523194, 0.030260298]
critic loss [0.034892008, 0.031401079]
critic loss [0.034631848, 0.032354176]
critic loss [0.034262523, 0.032896049]
actor loss -250.273
train end 0.7234213352203369
should be 1.0, predicted: [ 0.93752253], target predicted: [ 0.45208237]
start searching new action
get new action:  0.24349713325500488
start getting new_q
get new_q:  0.06792211532592773
train start:
critic loss [0.055437081, 0.042408042]
critic loss [0.05396856, 0.0419681]
critic loss [0.050357211, 0.040387951]
critic loss [0.045526728, 0.038107283]
critic loss [0.040488686, 0.035618376]
actor loss -154.576
train end 0.5791823863983154
should be 1.0, predicted: [ 0.84335309], target predicted: [ 0.65168613]
should be 1.0, predicted: [ 0.80828083], target predicted: [ 0.98457336]
should be 1.0, predicted: [ 0.95426887], target predicted: [ 0.76952684]
start searching new action
get new action:  0.2366471290588379
start getting new_q
get new_q:  0.07715153694152832
train start:
critic

critic loss [0.06251967, 0.039186001]
critic loss [0.061054327, 0.039732881]
critic loss [0.057071753, 0.039426446]
critic loss [0.051688526, 0.038408343]
actor loss -290.813
train end 0.6495182514190674
should be -1.0, predicted: [-0.66088283], target predicted: [-0.39073443]
should be -1.0, predicted: [-0.91126335], target predicted: [-0.67557663]
should be 1.0, predicted: [ 0.8909694], target predicted: [ 0.70948178]
Episode : 261 Replay Buffer 25409
TOTAL REWARD @ 261-th Episode  : Reward -1
Total Step: 141
start searching new action
get new action:  0.24898052215576172
start getting new_q
get new_q:  0.06798601150512695
train start:
critic loss [0.050686762, 0.042412393]
critic loss [0.047041476, 0.040348452]
critic loss [0.042906836, 0.037964974]
critic loss [0.038904369, 0.03562035]
critic loss [0.035504028, 0.033595935]
actor loss -479.154
train end 0.5633606910705566
should be -1.0, predicted: [-1.30439496], target predicted: [-0.6778385]
should be 1.0, predicted: [ 0.79714084

get new action:  0.2429215908050537
start getting new_q
get new_q:  0.07247114181518555
train start:
critic loss [0.066435695, 0.051964272]
critic loss [0.065169759, 0.051507268]
critic loss [0.062192217, 0.050059207]
critic loss [0.058071684, 0.047881812]
critic loss [0.053329431, 0.045244075]
actor loss -331.514
train end 0.5729727745056152
should be 1.0, predicted: [ 0.5826692], target predicted: [ 0.39767179]
start searching new action
get new action:  0.2443547248840332
start getting new_q
get new_q:  0.0669703483581543
train start:
critic loss [0.030098967, 0.0256861]
critic loss [0.029076237, 0.025923546]
critic loss [0.027734011, 0.025922678]
critic loss [0.02634722, 0.025662243]
critic loss [0.02522734, 0.025172066]
actor loss -242.245
train end 0.543041467666626
should be 1.0, predicted: [ 0.58338088], target predicted: [ 0.90301371]
start searching new action
get new action:  0.23776578903198242
start getting new_q
get new_q:  0.07665419578552246
train start:
critic loss [0.

critic loss [0.043996237, 0.028788248]
critic loss [0.037958451, 0.027767677]
actor loss -292.283
train end 0.9574265480041504
should be 1.0, predicted: [ 0.84186804], target predicted: [ 0.60773826]
should be 1.0, predicted: [ 0.88201863], target predicted: [ 0.37078643]
start searching new action
get new action:  0.22786641120910645
start getting new_q
get new_q:  0.06814169883728027
train start:
critic loss [0.062956601, 0.034203842]
critic loss [0.056230746, 0.032683834]
critic loss [0.048242122, 0.031056028]
critic loss [0.040996931, 0.029867563]
critic loss [0.035608657, 0.029400777]
actor loss -59.0906
train end 0.5673539638519287
should be 1.0, predicted: [ 1.26396251], target predicted: [ 0.52127504]
should be 1.0, predicted: [ 0.70762318], target predicted: [ 0.43870515]
should be 1.0, predicted: [ 0.8459115], target predicted: [ 0.75716287]
start searching new action
get new action:  0.23572921752929688
start getting new_q
get new_q:  0.06945157051086426
train start:
critic 

critic loss [0.22027133, 0.054877289]
critic loss [0.18102971, 0.054882213]
critic loss [0.14261155, 0.057043888]
critic loss [0.11320037, 0.061820939]
actor loss -236.812
train end 0.66353440284729
should be 1.0, predicted: [ 0.30698383], target predicted: [-0.18794103]
should be 1.0, predicted: [ 0.97319233], target predicted: [ 0.34044296]
should be 1.0, predicted: [ 0.72667456], target predicted: [ 0.16512822]
should be 1.0, predicted: [ 0.67913634], target predicted: [ 0.09014434]
should be 1.0, predicted: [ 1.01747322], target predicted: [ 0.51368254]
should be 1.0, predicted: [ 1.21271384], target predicted: [ 0.94023848]
should be 1.0, predicted: [ 0.39887848], target predicted: [ 0.02152635]
should be 1.0, predicted: [ 0.63695866], target predicted: [-0.03579389]
should be 1.0, predicted: [ 0.51524192], target predicted: [ 0.37383389]
start searching new action
get new action:  0.24082493782043457
start getting new_q
get new_q:  0.0698091983795166
train start:
critic loss [0.0

get new action:  0.236372709274292
start getting new_q
get new_q:  0.1288008689880371
train start:
critic loss [0.045011908, 0.038031697]
critic loss [0.044272158, 0.037976839]
critic loss [0.04307938, 0.03767271]
critic loss [0.041516185, 0.037088349]
critic loss [0.039644435, 0.036192164]
actor loss -417.765
train end 0.6365180015563965
should be 1.0, predicted: [ 0.79686588], target predicted: [ 0.5632301]
Episode : 268 Replay Buffer 26129
TOTAL REWARD @ 268-th Episode  : Reward 1
Total Step: 143
start searching new action
get new action:  0.23655319213867188
start getting new_q
get new_q:  0.06740808486938477
train start:
critic loss [0.058356486, 0.040580668]
critic loss [0.057274058, 0.040308993]
critic loss [0.05522918, 0.039611038]
critic loss [0.052463844, 0.038569756]
critic loss [0.049267728, 0.037297036]
actor loss -218.725
train end 0.5527501106262207
should be -1.0, predicted: [-0.9629482], target predicted: [-1.21190536]
should be 1.0, predicted: [ 0.56067878], target pr

get new action:  0.2273397445678711
start getting new_q
get new_q:  0.06063985824584961
train start:
critic loss [0.074438438, 0.03769628]
critic loss [0.07287994, 0.037079014]
critic loss [0.069496796, 0.035887226]
critic loss [0.064894728, 0.034376711]
critic loss [0.059654333, 0.032816604]
actor loss -592.422
train end 0.5400078296661377
should be 1.0, predicted: [ 0.79579157], target predicted: [ 0.78808075]
should be 1.0, predicted: [ 0.19629394], target predicted: [ 0.20141286]
should be -1.0, predicted: [-0.74400061], target predicted: [-0.56434947]
start searching new action
get new action:  0.24145126342773438
start getting new_q
get new_q:  0.07035660743713379
train start:
critic loss [0.028085113, 0.028085113]
critic loss [0.027787864, 0.027787864]
critic loss [0.027388696, 0.027388696]
critic loss [0.026825968, 0.026825968]
critic loss [0.026072882, 0.026072882]
actor loss -332.362
train end 0.5159215927124023
trained action prob map predicted by initial model for a startin

critic loss [0.055639185, 0.03408841]
critic loss [0.052522194, 0.034068257]
critic loss [0.04849565, 0.033864081]
critic loss [0.044262312, 0.033526625]
actor loss 10.7002
train end 0.546602725982666
should be 1.0, predicted: [ 0.92296612], target predicted: [ 0.76976478]
should be 1.0, predicted: [ 1.04953802], target predicted: [ 1.09847236]
should be 1.0, predicted: [ 0.76577705], target predicted: [ 0.63127446]
should be 1.0, predicted: [ 0.77320886], target predicted: [ 0.6531871]
should be 1.0, predicted: [ 0.97610056], target predicted: [ 0.56856167]
should be -1.0, predicted: [-0.78845954], target predicted: [-0.59137297]
start searching new action
get new action:  0.2512388229370117
start getting new_q
get new_q:  0.07658100128173828
train start:
critic loss [0.10585494, 0.04721266]
critic loss [0.10037925, 0.046286225]
critic loss [0.088530615, 0.044087104]
critic loss [0.072652712, 0.041115168]
critic loss [0.057195351, 0.038136587]
actor loss -415.811
train end 0.610618829

Episode : 273 Replay Buffer 26656
TOTAL REWARD @ 273-th Episode  : Reward 1
Total Step: 108
start searching new action
get new action:  0.26134610176086426
start getting new_q
get new_q:  0.08167481422424316
train start:
critic loss [0.037400175, 0.030423027]
critic loss [0.037172258, 0.03037964]
critic loss [0.036328301, 0.030142814]
critic loss [0.035003114, 0.029736608]
critic loss [0.033384148, 0.029197389]
actor loss -381.62
train end 0.620833158493042
should be 1.0, predicted: [ 0.75076419], target predicted: [ 0.9095993]
should be -1.0, predicted: [-0.76264638], target predicted: [-0.763273]
start searching new action
get new action:  0.2580544948577881
start getting new_q
get new_q:  0.06651496887207031
train start:
critic loss [0.061939187, 0.036392871]
critic loss [0.059189118, 0.036002479]
critic loss [0.054782268, 0.035332274]
critic loss [0.049517255, 0.034519333]
critic loss [0.04420568, 0.033675633]
actor loss -310.774
train end 0.6019418239593506
should be -1.0, predict

get new action:  0.24818205833435059
start getting new_q
get new_q:  0.06821084022521973
train start:
critic loss [0.031472228, 0.028014934]
critic loss [0.031134052, 0.02783639]
critic loss [0.030451644, 0.027534114]
critic loss [0.0295147, 0.027121384]
critic loss [0.028447526, 0.026625693]
actor loss -453.227
train end 0.8241722583770752
should be 1.0, predicted: [ 0.86607385], target predicted: [ 0.83336651]
should be -1.0, predicted: [-1.04765868], target predicted: [-0.96613055]
start searching new action
get new action:  0.31675052642822266
start getting new_q
get new_q:  0.09869742393493652
train start:
critic loss [0.1326976, 0.041244641]
critic loss [0.12586661, 0.040393457]
critic loss [0.11132145, 0.038665444]
critic loss [0.092621662, 0.036547143]
critic loss [0.073342487, 0.034546416]
actor loss -375.861
train end 0.6800265312194824
should be 1.0, predicted: [ 0.43728158], target predicted: [ 0.06748903]
should be 1.0, predicted: [ 1.05288446], target predicted: [ 0.95487

critic loss [0.053333499, 0.031961843]
actor loss 5.23483
train end 0.6101715564727783
should be 1.0, predicted: [ 0.92748284], target predicted: [ 0.80853009]
should be 1.0, predicted: [ 1.01302361], target predicted: [ 1.02298594]
should be 1.0, predicted: [ 0.57355052], target predicted: [ 0.46301255]
should be 1.0, predicted: [ 0.33376408], target predicted: [ 0.29312995]
start searching new action
get new action:  0.24541187286376953
start getting new_q
get new_q:  0.0679624080657959
train start:
critic loss [0.041210383, 0.030632149]
critic loss [0.042200767, 0.031566493]
critic loss [0.041747637, 0.031862572]
critic loss [0.040102586, 0.031493012]
critic loss [0.03759785, 0.030524246]
actor loss -345.285
train end 0.7275011539459229
should be -1.0, predicted: [-0.88815773], target predicted: [-0.68001544]
should be 1.0, predicted: [ 0.76042831], target predicted: [ 0.71169925]
should be 1.0, predicted: [ 0.90171713], target predicted: [ 0.87507099]
should be -1.0, predicted: [-1

critic loss [0.071404561, 0.032058254]
actor loss -527.077
train end 0.5462729930877686
should be -1.0, predicted: [-0.85057938], target predicted: [-0.8316915]
should be 1.0, predicted: [ 0.89644819], target predicted: [ 0.71364903]
should be -1.0, predicted: [-0.24095412], target predicted: [ 0.06000518]
should be 1.0, predicted: [ 0.48311529], target predicted: [ 0.36925259]
start searching new action
get new action:  0.23678898811340332
start getting new_q
get new_q:  0.0686497688293457
train start:
critic loss [0.032944713, 0.032944713]
critic loss [0.033965394, 0.033965394]
critic loss [0.034904633, 0.034904633]
critic loss [0.035511907, 0.035511907]
critic loss [0.035606544, 0.035606544]
actor loss -141.258
train end 0.5627057552337646
start searching new action
get new action:  0.23293447494506836
start getting new_q
get new_q:  0.06914544105529785
train start:
critic loss [0.07457751, 0.042561404]
critic loss [0.071121573, 0.04154034]
critic loss [0.065245949, 0.039704502]
cri

Episode : 280 Replay Buffer 27601
TOTAL REWARD @ 280-th Episode  : Reward 1
Total Step: 147
start searching new action
get new action:  0.2455294132232666
start getting new_q
get new_q:  0.06522703170776367
train start:
critic loss [0.027767947, 0.027138069]
critic loss [0.028307548, 0.027554961]
critic loss [0.028640835, 0.027792677]
critic loss [0.028703097, 0.02780758]
critic loss [0.028469298, 0.027580053]
actor loss -366.107
train end 0.5377252101898193
should be 1.0, predicted: [ 0.89967477], target predicted: [ 0.91470218]
should be -1.0, predicted: [-0.71197903], target predicted: [-0.9215368]
start searching new action
get new action:  0.2252950668334961
start getting new_q
get new_q:  0.06281328201293945
train start:
critic loss [0.089735404, 0.045842584]
critic loss [0.08775337, 0.045362007]
critic loss [0.083471701, 0.044413768]
critic loss [0.077554658, 0.043142907]
critic loss [0.070788532, 0.04170198]
actor loss -319.871
train end 0.555283784866333
should be -1.0, predic

critic loss [0.041707061, 0.0411167]
critic loss [0.043457739, 0.043080322]
critic loss [0.044272967, 0.044022892]
critic loss [0.044047691, 0.043861978]
actor loss -20.993
train end 0.5569071769714355
should be 1.0, predicted: [ 0.89360517], target predicted: [ 0.53375286]
start searching new action
get new action:  0.23250532150268555
start getting new_q
get new_q:  0.06009054183959961
train start:
critic loss [0.068715222, 0.04752611]
critic loss [0.065809771, 0.045954265]
critic loss [0.060146801, 0.04312649]
critic loss [0.052894268, 0.039565776]
critic loss [0.045289993, 0.03580071]
actor loss -278.639
train end 0.5260746479034424
should be 1.0, predicted: [ 0.94957918], target predicted: [ 0.8393696]
should be -1.0, predicted: [-0.56381768], target predicted: [-0.67017454]
Episode : 282 Replay Buffer 27884
TOTAL REWARD @ 282-th Episode  : Reward 1
Total Step: 130
start searching new action
get new action:  0.23772573471069336
start getting new_q
get new_q:  0.0681304931640625
tr

get new action:  0.25240230560302734
start getting new_q
get new_q:  0.06888365745544434
train start:
critic loss [0.035030846, 0.032280311]
critic loss [0.034722373, 0.032814506]
critic loss [0.034658574, 0.033405937]
critic loss [0.03465873, 0.033894658]
critic loss [0.034571528, 0.034126796]
actor loss -340.257
train end 0.5233500003814697
should be -1.0, predicted: [-0.80056101], target predicted: [-0.58807862]
should be 1.0, predicted: [ 0.93097192], target predicted: [ 0.82976991]
start searching new action
get new action:  0.23720502853393555
start getting new_q
get new_q:  0.05987191200256348
train start:
critic loss [0.056318831, 0.046796676]
critic loss [0.054940324, 0.04686892]
critic loss [0.052473217, 0.046076544]
critic loss [0.049240284, 0.044534497]
critic loss [0.045627572, 0.042437535]
actor loss -578.863
train end 0.523338794708252
should be -1.0, predicted: [-0.68754452], target predicted: [-0.60056305]
should be 1.0, predicted: [ 0.97024047], target predicted: [ 1.

critic loss [0.056152001, 0.037507653]
critic loss [0.05462075, 0.03686706]
critic loss [0.051884331, 0.035984397]
critic loss [0.048281439, 0.034874201]
actor loss -712.947
train end 0.631173849105835
should be -1.0, predicted: [-1.24217772], target predicted: [-1.12516582]
should be 1.0, predicted: [ 0.81955463], target predicted: [ 0.72339326]
should be 1.0, predicted: [ 0.75721031], target predicted: [ 0.79347444]
should be 1.0, predicted: [ 0.33956856], target predicted: [ 0.41722754]
start searching new action
get new action:  0.24152159690856934
start getting new_q
get new_q:  0.0646657943725586
train start:
critic loss [0.039484583, 0.03517928]
critic loss [0.038193688, 0.034313239]
critic loss [0.036375958, 0.033162668]
critic loss [0.034293685, 0.031851962]
critic loss [0.032204136, 0.030505117]
actor loss -408.576
train end 0.5746166706085205
should be 1.0, predicted: [ 0.8040368], target predicted: [ 0.70769721]
should be 1.0, predicted: [ 0.67924291], target predicted: [ 0

critic loss [0.037019413, 0.037019413]
critic loss [0.037095085, 0.037095085]
critic loss [0.036728915, 0.036728915]
critic loss [0.035981711, 0.035981711]
actor loss -498.692
train end 0.5603511333465576
start searching new action
get new action:  0.23825621604919434
start getting new_q
get new_q:  0.06683611869812012
train start:
critic loss [0.069540501, 0.038790345]
critic loss [0.065439403, 0.037704043]
critic loss [0.059960186, 0.0363786]
critic loss [0.053971376, 0.035079632]
critic loss [0.048259024, 0.033981107]
actor loss -324.889
train end 0.5588624477386475
should be 1.0, predicted: [ 0.98703951], target predicted: [ 0.73650271]
should be -1.0, predicted: [-0.88946885], target predicted: [-0.67615288]
should be -1.0, predicted: [-0.71919537], target predicted: [-0.60291725]
should be 1.0, predicted: [ 0.65056282], target predicted: [ 0.49993718]
start searching new action
get new action:  0.22798514366149902
start getting new_q
get new_q:  0.06338214874267578
train start:
c

critic loss [0.044824854, 0.026838562]
actor loss -248.245
train end 0.5061616897583008
should be 1.0, predicted: [ 0.3860375], target predicted: [ 0.26499465]
start searching new action
get new action:  0.23587322235107422
start getting new_q
get new_q:  0.07662844657897949
train start:
critic loss [0.028725121, 0.028714342]
critic loss [0.029842358, 0.02983826]
critic loss [0.030614808, 0.03061451]
critic loss [0.030977279, 0.030976333]
critic loss [0.030881487, 0.030875774]
actor loss -355.457
train end 0.6239707469940186
should be 1.0, predicted: [ 0.92409968], target predicted: [ 0.95287931]
start searching new action
get new action:  0.26236963272094727
start getting new_q
get new_q:  0.06604123115539551
train start:
critic loss [0.036862813, 0.034251377]
critic loss [0.035889298, 0.033598542]
critic loss [0.034203455, 0.032451481]
critic loss [0.032144841, 0.030992884]
critic loss [0.030035768, 0.029409636]
actor loss -225.04
train end 0.565474271774292
should be 1.0, predicted:

critic loss [0.039915904, 0.029166371]
critic loss [0.036714509, 0.028143153]
critic loss [0.033003829, 0.027024947]
critic loss [0.029532416, 0.026009213]
actor loss -486.113
train end 0.5065827369689941
should be 1.0, predicted: [ 0.66813898], target predicted: [ 0.45204049]
should be 1.0, predicted: [ 0.97826076], target predicted: [ 0.88148326]
start searching new action
get new action:  0.23876595497131348
start getting new_q
get new_q:  0.0644228458404541
train start:
critic loss [0.027642351, 0.027007431]
critic loss [0.02771512, 0.026901802]
critic loss [0.027606478, 0.026649211]
critic loss [0.027243044, 0.026192673]
critic loss [0.026600292, 0.025517438]
actor loss -401.78
train end 0.9137868881225586
should be 1.0, predicted: [ 0.83888227], target predicted: [ 0.77146691]
start searching new action
get new action:  0.24694108963012695
start getting new_q
get new_q:  0.06501889228820801
train start:
critic loss [0.032777514, 0.026197949]
critic loss [0.031882271, 0.02592634]


critic loss [0.028460981, 0.023085687]
actor loss -147.045
train end 0.5576279163360596
should be 1.0, predicted: [ 0.60420704], target predicted: [ 0.27836233]
start searching new action
get new action:  0.23702430725097656
start getting new_q
get new_q:  0.0630342960357666
train start:
critic loss [0.059327982, 0.032714054]
critic loss [0.055724751, 0.032812364]
critic loss [0.050709948, 0.03290819]
critic loss [0.045511462, 0.033098333]
critic loss [0.041040964, 0.033396885]
actor loss -819.659
train end 0.5189316272735596
should be 1.0, predicted: [ 0.72343409], target predicted: [ 0.05707486]
should be 1.0, predicted: [ 0.68885839], target predicted: [ 0.46256143]
start searching new action
get new action:  0.2334740161895752
start getting new_q
get new_q:  0.07023239135742188
train start:
critic loss [0.063151248, 0.036379762]
critic loss [0.060129568, 0.037050277]
critic loss [0.055852726, 0.037464973]
critic loss [0.051100224, 0.037656784]
critic loss [0.046599485, 0.037654437]

get new action:  0.2375943660736084
start getting new_q
get new_q:  0.0653390884399414
train start:
critic loss [0.065118678, 0.03153792]
critic loss [0.061700083, 0.030989004]
critic loss [0.056120753, 0.030102659]
critic loss [0.049476966, 0.02905724]
critic loss [0.04282986, 0.028059978]
actor loss -248.615
train end 0.5363993644714355
should be 1.0, predicted: [ 0.43702617], target predicted: [ 0.27828148]
start searching new action
get new action:  0.23938870429992676
start getting new_q
get new_q:  0.06658363342285156
train start:
critic loss [0.043384224, 0.035385713]
critic loss [0.042590503, 0.035088249]
critic loss [0.041444808, 0.03459733]
critic loss [0.039939862, 0.033937335]
critic loss [0.038055927, 0.033114187]
actor loss -264.509
train end 0.500873327255249
should be 1.0, predicted: [ 1.15808094], target predicted: [ 0.90655118]
should be 1.0, predicted: [ 0.7104705], target predicted: [ 0.47032255]
should be 1.0, predicted: [ 1.06994915], target predicted: [ 0.9410012

get new action:  0.22893190383911133
start getting new_q
get new_q:  0.0632932186126709
train start:
critic loss [0.044685252, 0.034290899]
critic loss [0.04333438, 0.03396409]
critic loss [0.041320402, 0.033387762]
critic loss [0.038967282, 0.032650307]
critic loss [0.036527157, 0.031821474]
actor loss -272.717
train end 0.5572571754455566
should be 1.0, predicted: [ 0.46578968], target predicted: [ 0.11533774]
start searching new action
get new action:  0.22779417037963867
start getting new_q
get new_q:  0.07249855995178223
train start:
critic loss [0.032129437, 0.031831052]
critic loss [0.032026507, 0.03166173]
critic loss [0.031533219, 0.031119447]
critic loss [0.030693477, 0.03026022]
critic loss [0.029570417, 0.029153638]
actor loss -248.266
train end 0.5438125133514404
should be 1.0, predicted: [ 0.75406635], target predicted: [ 0.08227251]
should be 1.0, predicted: [ 0.93905628], target predicted: [ 0.95385116]
should be 1.0, predicted: [ 0.93181527], target predicted: [ 1.0587

critic loss [0.084465474, 0.037186578]
critic loss [0.07576821, 0.035226237]
critic loss [0.065021507, 0.033180669]
critic loss [0.054384835, 0.031463165]
actor loss -327.048
train end 0.5773344039916992
should be 1.0, predicted: [ 1.01913941], target predicted: [ 0.7465924]
should be 1.0, predicted: [ 0.68229645], target predicted: [ 0.20796317]
should be -1.0, predicted: [-0.50380909], target predicted: [-0.83938771]
should be 1.0, predicted: [ 1.03636014], target predicted: [ 0.9513405]
should be 1.0, predicted: [ 0.92691714], target predicted: [ 0.61988002]
should be 1.0, predicted: [ 0.93631756], target predicted: [ 0.39833874]
start searching new action
get new action:  0.23591995239257812
start getting new_q
get new_q:  0.06314563751220703
train start:
critic loss [0.054401904, 0.03742557]
critic loss [0.052611597, 0.038351245]
critic loss [0.049138542, 0.038612559]
critic loss [0.045085117, 0.038309034]
critic loss [0.041332789, 0.037545092]
actor loss -118.215
train end 0.5334

Episode : 300 Replay Buffer 30069
TOTAL REWARD @ 300-th Episode  : Reward 1
Total Step: 118
start searching new action
get new action:  0.2571899890899658
start getting new_q
get new_q:  0.07332372665405273
train start:
critic loss [0.028404184, 0.028317913]
critic loss [0.028377606, 0.028251043]
critic loss [0.028056119, 0.027920783]
critic loss [0.027473904, 0.027362103]
critic loss [0.026691761, 0.026621949]
actor loss -424.872
train end 0.5860207080841064
should be 1.0, predicted: [ 0.9741919], target predicted: [ 0.94139516]
start searching new action
get new action:  0.23194599151611328
start getting new_q
get new_q:  0.07433104515075684
train start:
critic loss [0.097584538, 0.039715789]
critic loss [0.093554698, 0.038770929]
critic loss [0.084241152, 0.036683735]
critic loss [0.072283238, 0.034126595]
critic loss [0.06070739, 0.03186027]
actor loss -297.349
train end 0.7004868984222412
should be -1.0, predicted: [-0.28214878], target predicted: [-0.09292057]
should be -1.0, pre

critic loss [0.13736905, 0.049553007]
critic loss [0.12824102, 0.048018999]
critic loss [0.11607917, 0.045866743]
critic loss [0.10259438, 0.043390557]
actor loss 415.493
train end 0.6044101715087891
should be -1.0, predicted: [-0.17027517], target predicted: [ 0.0138095]
should be 1.0, predicted: [ 0.26191679], target predicted: [ 0.31894633]
should be 1.0, predicted: [ 0.7442801], target predicted: [ 0.68325144]
start searching new action
get new action:  0.22765588760375977
start getting new_q
get new_q:  0.06601762771606445
train start:
critic loss [0.029948594, 0.028330293]
critic loss [0.028783904, 0.027502635]
critic loss [0.027703967, 0.02665114]
critic loss [0.026787408, 0.025839956]
critic loss [0.026025791, 0.025097085]
actor loss 573.848
train end 0.5925507545471191
should be 1.0, predicted: [ 0.94393611], target predicted: [ 0.88941503]
should be 1.0, predicted: [ 0.82549882], target predicted: [ 0.8314234]
should be 1.0, predicted: [ 0.96114188], target predicted: [ 0.989

critic loss [0.088712968, 0.032300577]
critic loss [0.077617437, 0.030750286]
critic loss [0.064112954, 0.028969917]
critic loss [0.050903831, 0.027452828]
actor loss 601.749
train end 0.5430123805999756
should be 1.0, predicted: [ 0.6554476], target predicted: [ 0.00413273]
should be 1.0, predicted: [ 1.00223053], target predicted: [ 0.48050886]
start searching new action
get new action:  0.22341227531433105
start getting new_q
get new_q:  0.0670320987701416
train start:
critic loss [0.074926347, 0.031646986]
critic loss [0.075422898, 0.032593161]
critic loss [0.072802365, 0.032710392]
critic loss [0.067473695, 0.03198608]
critic loss [0.060287368, 0.030618705]
actor loss 532.304
train end 0.5993993282318115
should be -1.0, predicted: [-0.3652471], target predicted: [-0.30843142]
should be 1.0, predicted: [ 0.45247415], target predicted: [ 0.24522246]
start searching new action
get new action:  0.2377769947052002
start getting new_q
get new_q:  0.06524157524108887
train start:
critic 

critic loss [0.030578248, 0.022858324]
critic loss [0.029518463, 0.022668755]
critic loss [0.027881585, 0.022231882]
critic loss [0.025917407, 0.02162376]
actor loss 515.822
train end 0.5312488079071045
should be 1.0, predicted: [ 0.76358598], target predicted: [ 0.77587384]
should be 1.0, predicted: [ 0.84865457], target predicted: [ 0.58031243]
start searching new action
get new action:  0.23088455200195312
start getting new_q
get new_q:  0.06184244155883789
train start:
critic loss [0.042819932, 0.031072985]
critic loss [0.041174795, 0.030720726]
critic loss [0.038623258, 0.030132188]
critic loss [0.035625838, 0.02938555]
critic loss [0.032625325, 0.028549064]
actor loss 404.672
train end 0.5318083763122559
should be 1.0, predicted: [ 0.74064296], target predicted: [ 0.4341152]
should be 1.0, predicted: [ 0.88888967], target predicted: [ 0.78488475]
start searching new action
get new action:  0.23351573944091797
start getting new_q
get new_q:  0.06439709663391113
train start:
critic

get new action:  0.2241051197052002
start getting new_q
get new_q:  0.06904411315917969
train start:
critic loss [0.054774091, 0.042953387]
critic loss [0.05086486, 0.040800396]
critic loss [0.045404814, 0.037632465]
critic loss [0.039476279, 0.034130819]
critic loss [0.034089193, 0.03088952]
actor loss 648.769
train end 0.5812056064605713
should be 1.0, predicted: [ 1.16317117], target predicted: [ 0.82434553]
should be 1.0, predicted: [ 1.23541319], target predicted: [ 0.75636178]
should be -1.0, predicted: [-0.70636344], target predicted: [-0.6275509]
start searching new action
get new action:  0.2383434772491455
start getting new_q
get new_q:  0.06337261199951172
train start:
critic loss [0.072651714, 0.026624333]
critic loss [0.068015404, 0.027138859]
critic loss [0.062241092, 0.027725277]
critic loss [0.055902433, 0.028125029]
critic loss [0.049414694, 0.028147906]
actor loss 520.638
train end 0.5292830467224121
should be 1.0, predicted: [ 0.83934504], target predicted: [ 0.60626

critic loss [0.04904674, 0.031833164]
critic loss [0.04634738, 0.030214122]
critic loss [0.04208032, 0.028119903]
critic loss [0.037054125, 0.02591835]
actor loss 454.883
train end 0.5311942100524902
should be 1.0, predicted: [ 0.36075085], target predicted: [ 0.71567619]
should be 1.0, predicted: [ 0.88479292], target predicted: [ 0.83494747]
start searching new action
get new action:  0.2408134937286377
start getting new_q
get new_q:  0.06681632995605469
train start:
critic loss [0.066232547, 0.032366578]
critic loss [0.060014378, 0.030944906]
critic loss [0.052748691, 0.029496446]
critic loss [0.045621723, 0.02819089]
critic loss [0.039356366, 0.027087007]
actor loss 412.174
train end 0.5263314247131348
should be 1.0, predicted: [ 0.43649444], target predicted: [ 0.45165032]
should be 1.0, predicted: [ 0.93988389], target predicted: [ 0.89926106]
should be -1.0, predicted: [-1.01025808], target predicted: [-0.93991333]
should be 1.0, predicted: [ 1.0423696], target predicted: [ 1.16

critic loss [0.057349961, 0.031154063]
critic loss [0.051897757, 0.030165732]
critic loss [0.045523517, 0.029000856]
critic loss [0.03929732, 0.027924497]
actor loss 458.271
train end 0.48964619636535645
should be 1.0, predicted: [ 0.96934807], target predicted: [ 0.69376725]
should be -1.0, predicted: [-0.5089218], target predicted: [-0.52629715]
should be 1.0, predicted: [ 0.86262029], target predicted: [ 0.61912191]
should be 1.0, predicted: [ 1.12038136], target predicted: [ 1.03564727]
start searching new action
get new action:  0.23510956764221191
start getting new_q
get new_q:  0.06408476829528809
train start:
critic loss [0.069366768, 0.030884109]
critic loss [0.064527944, 0.030854164]
critic loss [0.055979796, 0.030074911]
critic loss [0.046071418, 0.028914088]
critic loss [0.037153028, 0.027705092]
actor loss 74.914
train end 0.5141243934631348
should be -1.0, predicted: [-0.94207746], target predicted: [-0.31672502]
should be 1.0, predicted: [ 0.93151683], target predicted: 

critic loss [0.17690274, 0.041332569]
critic loss [0.14896452, 0.038182411]
critic loss [0.11829858, 0.035260022]
critic loss [0.089526147, 0.03325988]
actor loss 45.6006
train end 0.5153541564941406
should be -1.0, predicted: [-0.72159827], target predicted: [-0.36842647]
should be 1.0, predicted: [ 0.67370099], target predicted: [ 0.65215707]
should be -1.0, predicted: [-0.82252884], target predicted: [-0.5086239]
should be -1.0, predicted: [-0.16678184], target predicted: [ 0.07719596]
start searching new action
get new action:  0.23383498191833496
start getting new_q
get new_q:  0.06574416160583496
train start:
critic loss [0.032547057, 0.032547057]
critic loss [0.03398294, 0.03398294]
critic loss [0.035114944, 0.035114944]
critic loss [0.035628255, 0.035628255]
critic loss [0.035366617, 0.035366617]
actor loss 193.598
train end 0.4970376491546631
start searching new action
get new action:  0.22333455085754395
start getting new_q
get new_q:  0.06725025177001953
train start:
critic 

critic loss [0.029949192, 0.027925078]
actor loss 463.261
train end 0.5752863883972168
should be 1.0, predicted: [ 1.13455343], target predicted: [ 0.93933535]
should be 1.0, predicted: [ 0.83861214], target predicted: [ 0.85621005]
should be 1.0, predicted: [ 1.23076475], target predicted: [ 0.96802598]
should be 1.0, predicted: [ 1.00170946], target predicted: [ 0.70459485]
start searching new action
get new action:  0.24342632293701172
start getting new_q
get new_q:  0.08276104927062988
train start:
critic loss [0.031727176, 0.026896073]
critic loss [0.030263539, 0.025979562]
critic loss [0.028358452, 0.024964042]
critic loss [0.026288424, 0.023921376]
critic loss [0.024319828, 0.022913802]
actor loss 371.22
train end 0.6486780643463135
should be 1.0, predicted: [ 0.90741211], target predicted: [ 0.98809344]
should be 1.0, predicted: [ 0.91059548], target predicted: [ 0.8619808]
should be -1.0, predicted: [-0.74387193], target predicted: [-0.85412955]
start searching new action
get 

get new action:  0.23450398445129395
start getting new_q
get new_q:  0.06571674346923828
train start:
critic loss [0.07442373, 0.030228009]
critic loss [0.07217586, 0.029768413]
critic loss [0.067620769, 0.029203497]
critic loss [0.061566427, 0.028609095]
critic loss [0.054834608, 0.02806123]
actor loss 167.021
train end 0.887873649597168
should be 1.0, predicted: [ 1.12652588], target predicted: [ 0.93068105]
should be 1.0, predicted: [ 0.64231342], target predicted: [ 0.53644419]
should be 1.0, predicted: [ 0.46359211], target predicted: [ 0.08620308]
start searching new action
get new action:  0.2487046718597412
start getting new_q
get new_q:  0.0838770866394043
train start:
critic loss [0.030477146, 0.030477146]
critic loss [0.030894682, 0.030894682]
critic loss [0.031011242, 0.031011242]
critic loss [0.030836934, 0.030836934]
critic loss [0.030415086, 0.030415086]
actor loss 293.928
train end 0.5583937168121338
start searching new action
get new action:  0.22925353050231934
start 

critic loss [0.031696994, 0.024437189]
critic loss [0.028673002, 0.023399599]
actor loss 270.289
train end 0.629429817199707
should be 1.0, predicted: [ 0.60120845], target predicted: [ 0.48699388]
Episode : 318 Replay Buffer 32046
TOTAL REWARD @ 318-th Episode  : Reward -1
Total Step: 103
start searching new action
get new action:  0.24060750007629395
start getting new_q
get new_q:  0.06869125366210938
train start:
critic loss [0.086536229, 0.037235558]
critic loss [0.082370669, 0.036608517]
critic loss [0.074179947, 0.035694927]
critic loss [0.064548939, 0.034741934]
critic loss [0.054661416, 0.033845119]
actor loss 397.659
train end 0.5554606914520264
should be 1.0, predicted: [ 0.76568925], target predicted: [ 0.86044347]
should be 1.0, predicted: [ 1.09172261], target predicted: [ 0.94454366]
should be 1.0, predicted: [ 0.65043765], target predicted: [ 0.33170182]
should be 1.0, predicted: [ 0.72136706], target predicted: [ 0.18835072]
should be 1.0, predicted: [ 0.97352499], targ

get new action:  0.2311544418334961
start getting new_q
get new_q:  0.06510019302368164
train start:
critic loss [0.028916836, 0.023087805]
critic loss [0.02815146, 0.022738153]
critic loss [0.02714755, 0.022444513]
critic loss [0.02591119, 0.02211676]
critic loss [0.024529915, 0.021695998]
actor loss 132.816
train end 0.5288221836090088
should be 1.0, predicted: [ 0.83193815], target predicted: [ 0.82651895]
should be 1.0, predicted: [ 0.72210735], target predicted: [ 0.61424541]
start searching new action
get new action:  0.23122215270996094
start getting new_q
get new_q:  0.06658935546875
train start:
critic loss [0.032111414, 0.032111414]
critic loss [0.031743236, 0.031743236]
critic loss [0.031130146, 0.031130146]
critic loss [0.030342691, 0.030342691]
critic loss [0.029450381, 0.029450381]
actor loss 141.407
train end 0.5308668613433838
trained action prob map predicted by initial model for a starting game
[[[  1.73847470e-09   1.45409951e-09   1.45408563e-09   1.45716750e-09
   

get new action:  0.2336719036102295
start getting new_q
get new_q:  0.06563520431518555
train start:
critic loss [0.047683384, 0.03041774]
critic loss [0.045237917, 0.029781224]
critic loss [0.041816458, 0.029024124]
critic loss [0.038131759, 0.028311256]
critic loss [0.034780741, 0.027755328]
actor loss 167.669
train end 0.719498872756958
should be -1.0, predicted: [-1.04841268], target predicted: [-1.00934017]
should be 1.0, predicted: [ 0.85089612], target predicted: [ 0.8112309]
should be 1.0, predicted: [ 0.95194197], target predicted: [ 0.92874616]
should be 1.0, predicted: [ 0.96086341], target predicted: [ 1.21404564]
should be -1.0, predicted: [-0.67774957], target predicted: [-0.32095057]
start searching new action
get new action:  0.31886720657348633
start getting new_q
get new_q:  0.08383965492248535
train start:
critic loss [0.037182409, 0.027975399]
critic loss [0.035235971, 0.027943933]
critic loss [0.033330634, 0.027977377]
critic loss [0.031659871, 0.028042408]
critic 

get new action:  0.23127985000610352
start getting new_q
get new_q:  0.06319212913513184
train start:
critic loss [0.083752997, 0.025024734]
critic loss [0.078450628, 0.024202861]
critic loss [0.069838181, 0.023042776]
critic loss [0.059474342, 0.021722803]
critic loss [0.048799653, 0.020408049]
actor loss -47.0972
train end 0.5365972518920898
should be -1.0, predicted: [-0.82040411], target predicted: [-0.66331154]
should be 1.0, predicted: [ 0.08664574], target predicted: [ 0.1227289]
should be -1.0, predicted: [-0.90804976], target predicted: [-0.42799693]
start searching new action
get new action:  0.2528085708618164
start getting new_q
get new_q:  0.07080221176147461
train start:
critic loss [0.052653246, 0.028660031]
critic loss [0.051585153, 0.028219357]
critic loss [0.048076801, 0.027129954]
critic loss [0.042905934, 0.02562313]
critic loss [0.036983293, 0.02396347]
actor loss 386.044
train end 0.6531171798706055
should be 1.0, predicted: [ 0.75475991], target predicted: [ 0.68

get new action:  0.2509450912475586
start getting new_q
get new_q:  0.06429147720336914
train start:
critic loss [0.048687309, 0.024910266]
critic loss [0.046551384, 0.024357636]
critic loss [0.043151468, 0.023636617]
critic loss [0.038960725, 0.022833258]
critic loss [0.034510233, 0.022043962]
actor loss 78.791
train end 0.5144715309143066
should be 1.0, predicted: [ 1.09238946], target predicted: [ 1.20173001]
should be -1.0, predicted: [-0.64213365], target predicted: [-0.60749447]
start searching new action
get new action:  0.23267745971679688
start getting new_q
get new_q:  0.08072686195373535
train start:
critic loss [0.028065983, 0.028065983]
critic loss [0.027351972, 0.027351972]
critic loss [0.026609544, 0.026609544]
critic loss [0.025873497, 0.025873497]
critic loss [0.025164906, 0.025164906]
actor loss -141.083
train end 0.5293192863464355
start searching new action
get new action:  0.2364959716796875
start getting new_q
get new_q:  0.0651392936706543
train start:
critic los

actor loss 182.845
train end 0.9609873294830322
should be -1.0, predicted: [-1.15894651], target predicted: [-1.57897234]
should be 1.0, predicted: [ 0.79905301], target predicted: [ 0.27031603]
start searching new action
get new action:  0.26809167861938477
start getting new_q
get new_q:  0.07055354118347168
train start:
critic loss [0.049944602, 0.031613674]
critic loss [0.049042575, 0.031707846]
critic loss [0.045880467, 0.031012259]
critic loss [0.041258812, 0.029739996]
critic loss [0.036158755, 0.028190944]
actor loss 213.814
train end 0.5903475284576416
should be -1.0, predicted: [-0.37376803], target predicted: [-0.73390937]
should be 1.0, predicted: [ 1.24006891], target predicted: [ 0.7985509]
should be 1.0, predicted: [ 1.13773894], target predicted: [ 1.08162796]
Episode : 327 Replay Buffer 32889
TOTAL REWARD @ 327-th Episode  : Reward 1
Total Step: 122
start searching new action
get new action:  0.25098299980163574
start getting new_q
get new_q:  0.07670974731445312
train 

get new action:  0.226820707321167
start getting new_q
get new_q:  0.06451201438903809
train start:
critic loss [0.030360637, 0.024743568]
critic loss [0.032632273, 0.026389964]
critic loss [0.034042232, 0.027598362]
critic loss [0.034361001, 0.02820912]
critic loss [0.03357859, 0.028178373]
actor loss 168.942
train end 0.5608186721801758
should be 1.0, predicted: [ 1.01361454], target predicted: [ 0.45602968]
should be -1.0, predicted: [-0.63206774], target predicted: [-0.98721212]
should be -1.0, predicted: [-0.58545941], target predicted: [-0.47443092]
start searching new action
get new action:  0.2335662841796875
start getting new_q
get new_q:  0.06233406066894531
train start:
critic loss [0.06547448, 0.039338622]
critic loss [0.061086331, 0.037818376]
critic loss [0.054080606, 0.035341274]
critic loss [0.045859966, 0.032521635]
critic loss [0.038096812, 0.02995318]
actor loss -13.5951
train end 0.5713515281677246
should be -1.0, predicted: [-0.47859418], target predicted: [-0.6394

get new action:  0.2268083095550537
start getting new_q
get new_q:  0.06732654571533203
train start:
critic loss [0.030140059, 0.025561431]
critic loss [0.029429365, 0.025158389]
critic loss [0.0284147, 0.024675775]
critic loss [0.027210418, 0.02414149]
critic loss [0.025947997, 0.023601256]
actor loss 118.997
train end 0.546215295791626
should be 1.0, predicted: [ 0.87392503], target predicted: [ 0.81952667]
should be 1.0, predicted: [ 0.98620456], target predicted: [ 0.89171511]
start searching new action
get new action:  0.23539328575134277
start getting new_q
get new_q:  0.06643867492675781
train start:
critic loss [0.026780166, 0.02650414]
critic loss [0.026531914, 0.026266675]
critic loss [0.026066966, 0.02585379]
critic loss [0.02543268, 0.025293618]
critic loss [0.024689833, 0.02462245]
actor loss 285.457
train end 0.668764591217041
should be 1.0, predicted: [ 1.06938672], target predicted: [ 0.98365051]
start searching new action
get new action:  0.24476957321166992
start gett

Episode : 332 Replay Buffer 33623
TOTAL REWARD @ 332-th Episode  : Reward -1
Total Step: 140
start searching new action
get new action:  0.26372432708740234
start getting new_q
get new_q:  0.06755828857421875
train start:
critic loss [0.096572533, 0.034153048]
critic loss [0.091851883, 0.033585783]
critic loss [0.083847478, 0.032607187]
critic loss [0.074045032, 0.031432793]
critic loss [0.063929498, 0.030286415]
actor loss 188.902
train end 0.5269553661346436
should be 1.0, predicted: [ 0.65820032], target predicted: [ 0.645868]
should be 1.0, predicted: [ 1.05968726], target predicted: [ 0.80504644]
should be 1.0, predicted: [ 0.46345425], target predicted: [ 0.24177131]
should be 1.0, predicted: [ 0.58452553], target predicted: [ 0.35795668]
start searching new action
get new action:  0.2434988021850586
start getting new_q
get new_q:  0.08220624923706055
train start:
critic loss [0.028363319, 0.024147823]
critic loss [0.026737997, 0.023213865]
critic loss [0.024644304, 0.022084482]


critic loss [0.024906507, 0.02277844]
actor loss 107.816
train end 0.6497600078582764
should be 1.0, predicted: [ 1.02899408], target predicted: [ 1.04716527]
should be 1.0, predicted: [ 1.09208679], target predicted: [ 1.06090593]
should be -1.0, predicted: [-1.12549448], target predicted: [-1.52977026]
start searching new action
get new action:  0.24271726608276367
start getting new_q
get new_q:  0.06031346321105957
train start:
critic loss [0.021944545, 0.020254448]
critic loss [0.02203078, 0.02023593]
critic loss [0.021994732, 0.020247806]
critic loss [0.021810567, 0.020239882]
critic loss [0.021478936, 0.020173788]
actor loss 98.0427
train end 0.7955172061920166
should be 1.0, predicted: [ 0.81795758], target predicted: [ 0.81970125]
start searching new action
get new action:  0.2430894374847412
start getting new_q
get new_q:  0.06770539283752441
train start:
critic loss [0.024425892, 0.024366582]
critic loss [0.024415489, 0.024355024]
critic loss [0.024231456, 0.024177626]
critic

Episode : 336 Replay Buffer 34132
TOTAL REWARD @ 336-th Episode  : Reward -1
Total Step: 115
start searching new action
get new action:  0.2463388442993164
start getting new_q
get new_q:  0.08637022972106934
train start:
critic loss [0.050288033, 0.02879253]
critic loss [0.048805602, 0.028845923]
critic loss [0.046620492, 0.028823378]
critic loss [0.043932963, 0.028695114]
critic loss [0.040891748, 0.028439118]
actor loss 46.0689
train end 0.5579409599304199
should be -1.0, predicted: [-0.60729778], target predicted: [-0.57555145]
should be 1.0, predicted: [ 0.65740854], target predicted: [ 0.0129055]
start searching new action
get new action:  0.24124503135681152
start getting new_q
get new_q:  0.06560826301574707
train start:
critic loss [0.048707366, 0.032794505]
critic loss [0.046799891, 0.032545645]
critic loss [0.043528661, 0.031794041]
critic loss [0.039575785, 0.030732078]
critic loss [0.035564996, 0.029546466]
actor loss 29.9772
train end 0.6233270168304443
should be 1.0, pred

actor loss 60.0456
train end 0.6463723182678223
should be 1.0, predicted: [ 0.81785935], target predicted: [ 0.82086968]
Episode : 338 Replay Buffer 34267
TOTAL REWARD @ 338-th Episode  : Reward 1
Total Step: 119
start searching new action
get new action:  0.26842761039733887
start getting new_q
get new_q:  0.07247805595397949
train start:
critic loss [0.042788249, 0.031437419]
critic loss [0.041188642, 0.030718202]
critic loss [0.038778167, 0.029685806]
critic loss [0.035904571, 0.028480869]
critic loss [0.032934491, 0.027269486]
actor loss 105.884
train end 1.1012530326843262
should be 1.0, predicted: [ 0.57808119], target predicted: [ 0.51510561]
start searching new action
get new action:  0.25752735137939453
start getting new_q
get new_q:  0.06541728973388672
train start:
critic loss [0.050034583, 0.025571974]
critic loss [0.047646809, 0.025297336]
critic loss [0.043787867, 0.02494405]
critic loss [0.039267942, 0.024560556]
critic loss [0.034708388, 0.024170823]
actor loss 0.67992


critic loss [0.027526917, 0.0197994]
actor loss 75.4045
train end 0.5545096397399902
should be -1.0, predicted: [-0.62251431], target predicted: [-0.59507936]
should be 1.0, predicted: [ 1.09223878], target predicted: [ 0.88670319]
should be 1.0, predicted: [ 0.57929122], target predicted: [ 0.37208816]
start searching new action
get new action:  0.25224828720092773
start getting new_q
get new_q:  0.07284212112426758
train start:
critic loss [0.02776847, 0.024501806]
critic loss [0.027271925, 0.024167592]
critic loss [0.026498057, 0.023870364]
critic loss [0.025579289, 0.023599524]
critic loss [0.024658505, 0.023348089]
actor loss 159.471
train end 0.6493651866912842
should be 1.0, predicted: [ 0.92536002], target predicted: [ 0.85759932]
should be 1.0, predicted: [ 0.87585825], target predicted: [ 0.78483588]
trained action prob map predicted by initial model for a starting game
[[[ 0.13617589  0.01265609  0.01265983  0.01266188  0.01265109  0.01266187
    0.01269681]
  [ 0.01269656  

get new action:  0.24021530151367188
start getting new_q
get new_q:  0.06918907165527344
train start:
critic loss [0.075227559, 0.03650143]
critic loss [0.070805661, 0.034122348]
critic loss [0.063171156, 0.031149745]
critic loss [0.053955488, 0.028220521]
critic loss [0.044601005, 0.02581697]
actor loss 72.2997
train end 0.5064294338226318
should be 1.0, predicted: [ 0.90179777], target predicted: [ 1.04368317]
should be 1.0, predicted: [ 0.28262827], target predicted: [ 0.18476875]
start searching new action
get new action:  0.23783254623413086
start getting new_q
get new_q:  0.06311249732971191
train start:
critic loss [0.019219577, 0.017300922]
critic loss [0.019763293, 0.01816164]
critic loss [0.020186875, 0.019056492]
critic loss [0.020476254, 0.019745775]
critic loss [0.020489108, 0.02005576]
actor loss 100.761
train end 0.5029916763305664
should be 1.0, predicted: [ 1.05041683], target predicted: [ 1.26817155]
start searching new action
get new action:  0.2381577491760254
start

get new action:  0.2505044937133789
start getting new_q
get new_q:  0.0745992660522461
train start:
critic loss [0.032939121, 0.024146438]
critic loss [0.033162542, 0.024577513]
critic loss [0.032128014, 0.024740946]
critic loss [0.030184589, 0.024614196]
critic loss [0.027846316, 0.024253029]
actor loss 126.267
train end 0.6562144756317139
should be 1.0, predicted: [ 0.85120457], target predicted: [ 0.78489721]
should be 1.0, predicted: [ 0.83475184], target predicted: [ 0.70090914]
start searching new action
get new action:  0.2491450309753418
start getting new_q
get new_q:  0.07271718978881836
train start:
critic loss [0.097139001, 0.029646514]
critic loss [0.091642745, 0.028880969]
critic loss [0.081252649, 0.027102273]
critic loss [0.068892658, 0.02484612]
critic loss [0.056355868, 0.022558609]
actor loss 150.235
train end 0.5831394195556641
should be 1.0, predicted: [ 0.99902141], target predicted: [ 1.00760865]
should be 1.0, predicted: [ 0.93306839], target predicted: [ 0.71122

critic loss [0.031239901, 0.031239901]
actor loss 40.0593
train end 0.499164342880249
Episode : 345 Replay Buffer 35132
TOTAL REWARD @ 345-th Episode  : Reward 1
Total Step: 142
start searching new action
get new action:  0.23526811599731445
start getting new_q
get new_q:  0.06703782081604004
train start:
critic loss [0.024710465, 0.022533139]
critic loss [0.02372835, 0.021787776]
critic loss [0.022259884, 0.020697471]
critic loss [0.020611092, 0.01948018]
critic loss [0.019061424, 0.018320315]
actor loss 54.7427
train end 0.5327908992767334
should be -1.0, predicted: [-0.77907258], target predicted: [-0.56316799]
should be 1.0, predicted: [ 0.91280168], target predicted: [ 0.78865629]
start searching new action
get new action:  0.22504377365112305
start getting new_q
get new_q:  0.06410455703735352
train start:
critic loss [0.058793191, 0.035151325]
critic loss [0.054336064, 0.033939868]
critic loss [0.048825193, 0.03262832]
critic loss [0.043434318, 0.03145742]
critic loss [0.0389489

get new action:  0.23655986785888672
start getting new_q
get new_q:  0.07769608497619629
train start:
critic loss [0.17893645, 0.040658023]
critic loss [0.15329151, 0.037581488]
critic loss [0.11338375, 0.033404339]
critic loss [0.074264057, 0.030256039]
critic loss [0.047137309, 0.029731551]
actor loss 86.3781
train end 0.7533121109008789
should be 1.0, predicted: [ 1.19677329], target predicted: [ 0.60019159]
should be 1.0, predicted: [ 0.93650186], target predicted: [ 0.69976008]
should be 1.0, predicted: [ 0.89583504], target predicted: [ 0.15064238]
should be 1.0, predicted: [ 0.82131481], target predicted: [ 0.32367498]
should be 1.0, predicted: [ 0.85339212], target predicted: [ 0.62172204]
should be 1.0, predicted: [ 0.92091155], target predicted: [ 0.36871895]
start searching new action
get new action:  0.23266267776489258
start getting new_q
get new_q:  0.08639144897460938
train start:
critic loss [0.057507336, 0.041590281]
critic loss [0.066927694, 0.047859486]
critic loss [

get new action:  0.2345888614654541
start getting new_q
get new_q:  0.0659034252166748
train start:
critic loss [0.052789111, 0.022239631]
critic loss [0.050375119, 0.022360854]
critic loss [0.046346474, 0.022499805]
critic loss [0.041487984, 0.022655133]
critic loss [0.036618143, 0.022834811]
actor loss 105.887
train end 0.5113034248352051
should be -1.0, predicted: [-0.69994664], target predicted: [-0.77333915]
should be -1.0, predicted: [-0.61375517], target predicted: [-0.60404587]
should be 1.0, predicted: [ 1.0553056], target predicted: [ 0.78322661]
should be 1.0, predicted: [ 1.01049554], target predicted: [ 1.0230099]
should be 1.0, predicted: [ 0.69982183], target predicted: [ 0.55828714]
start searching new action
get new action:  0.22801709175109863
start getting new_q
get new_q:  0.06315970420837402
train start:
critic loss [0.078242615, 0.047362514]
critic loss [0.073873833, 0.047116213]
critic loss [0.066624373, 0.045720253]
critic loss [0.058372341, 0.043670584]
critic 

Episode : 350 Replay Buffer 35521
TOTAL REWARD @ 350-th Episode  : Reward -1
Total Step: 131
start searching new action
get new action:  0.2698187828063965
start getting new_q
get new_q:  0.0692896842956543
train start:
critic loss [0.035306934, 0.035187453]
critic loss [0.034460016, 0.034362815]
critic loss [0.033254914, 0.033164762]
critic loss [0.031848915, 0.031756267]
critic loss [0.030384516, 0.030281957]
actor loss 67.6926
train end 0.5049564838409424
should be 1.0, predicted: [ 0.96367425], target predicted: [ 0.81777543]
start searching new action
get new action:  0.2311077117919922
start getting new_q
get new_q:  0.07839822769165039
train start:
critic loss [0.071319476, 0.040999204]
critic loss [0.065694973, 0.038792528]
critic loss [0.057999846, 0.035894409]
critic loss [0.049925935, 0.032913953]
critic loss [0.042809423, 0.030298479]
actor loss 61.0765
train end 0.501758337020874
should be 1.0, predicted: [ 0.90088487], target predicted: [ 1.03377759]
should be -1.0, predi

start searching new action
get new action:  0.2489471435546875
start getting new_q
get new_q:  0.06780529022216797
train start:
critic loss [0.031944886, 0.028576847]
critic loss [0.03199511, 0.029074276]
critic loss [0.031442195, 0.029090907]
critic loss [0.030397899, 0.028667618]
critic loss [0.029041551, 0.027884811]
actor loss 32.6973
train end 0.6175775527954102
should be 1.0, predicted: [ 1.00666285], target predicted: [ 0.60335577]
should be 1.0, predicted: [ 1.16564512], target predicted: [ 1.09192634]
start searching new action
get new action:  0.2550089359283447
start getting new_q
get new_q:  0.06708073616027832
train start:
critic loss [0.074656673, 0.042443834]
critic loss [0.070885375, 0.041308582]
critic loss [0.065171421, 0.040033296]
critic loss [0.058644854, 0.038754895]
critic loss [0.052371703, 0.037530467]
actor loss 7.23667
train end 0.5635302066802979
should be 1.0, predicted: [ 0.68911809], target predicted: [ 0.20429413]
should be 1.0, predicted: [ 0.81177622],

critic loss [0.034164958, 0.030176811]
actor loss 11.6084
train end 0.5460762977600098
should be -1.0, predicted: [-0.84482068], target predicted: [-1.10045028]
should be 1.0, predicted: [ 0.74901325], target predicted: [ 0.84953326]
start searching new action
get new action:  0.25293517112731934
start getting new_q
get new_q:  0.07549834251403809
train start:
critic loss [0.031834207, 0.027939014]
critic loss [0.031530324, 0.027396515]
critic loss [0.031017873, 0.026851127]
critic loss [0.030315388, 0.026307255]
critic loss [0.029453121, 0.025763858]
actor loss 30.1193
train end 0.8778162002563477
should be 1.0, predicted: [ 0.99361861], target predicted: [ 0.92927957]
should be 1.0, predicted: [ 0.8241899], target predicted: [ 0.9021064]
should be -1.0, predicted: [-0.75217742], target predicted: [-0.77374911]
start searching new action
get new action:  0.23789048194885254
start getting new_q
get new_q:  0.07064223289489746
train start:
critic loss [0.092867509, 0.038448669]
critic l

critic loss [0.041494023, 0.026516451]
critic loss [0.037951343, 0.025838405]
critic loss [0.033359606, 0.024672946]
critic loss [0.02870819, 0.023259502]
actor loss 65.3093
train end 0.5761659145355225
should be 1.0, predicted: [ 1.22352767], target predicted: [ 1.17516327]
start searching new action
get new action:  0.23390722274780273
start getting new_q
get new_q:  0.07420039176940918
train start:
critic loss [0.06047925, 0.024713643]
critic loss [0.058622427, 0.023643374]
critic loss [0.054851312, 0.022597238]
critic loss [0.049940445, 0.021641336]
critic loss [0.044371869, 0.020810423]
actor loss 52.1517
train end 0.6997599601745605
should be 1.0, predicted: [ 0.66163534], target predicted: [ 0.58530217]
should be -1.0, predicted: [-0.85794234], target predicted: [-1.10142517]
should be 1.0, predicted: [ 0.43006021], target predicted: [ 0.27595589]
should be 1.0, predicted: [ 0.9577018], target predicted: [ 1.07133651]
should be 1.0, predicted: [ 0.74499965], target predicted: [ 

get new action:  0.2652719020843506
start getting new_q
get new_q:  0.07316350936889648
train start:
critic loss [0.026095323, 0.026015803]
critic loss [0.025702769, 0.025651017]
critic loss [0.02526607, 0.025237767]
critic loss [0.024745513, 0.024733696]
critic loss [0.024107944, 0.024104949]
actor loss 42.2361
train end 0.5166935920715332
should be -1.0, predicted: [-0.83210999], target predicted: [-0.61972892]
start searching new action
get new action:  0.23473143577575684
start getting new_q
get new_q:  0.06615781784057617
train start:
critic loss [0.037361152, 0.032146823]
critic loss [0.036276437, 0.031345144]
critic loss [0.034692623, 0.030319801]
critic loss [0.032799061, 0.029143665]
critic loss [0.030762319, 0.027879562]
actor loss -11.8092
train end 0.534919261932373
should be -1.0, predicted: [-0.78520799], target predicted: [-0.72852331]
should be 1.0, predicted: [ 1.08975351], target predicted: [ 1.07428956]
should be 1.0, predicted: [ 0.78480351], target predicted: [ 0.7

critic loss [0.034200117, 0.026934819]
critic loss [0.033740424, 0.027024899]
critic loss [0.032604314, 0.026823528]
critic loss [0.031040767, 0.026396714]
critic loss [0.029309966, 0.025816757]
actor loss 32.684
train end 0.9374992847442627
should be 1.0, predicted: [ 0.60829753], target predicted: [ 0.68099779]
should be 1.0, predicted: [ 0.88341743], target predicted: [ 0.73638809]
start searching new action
get new action:  0.24583673477172852
start getting new_q
get new_q:  0.06342029571533203
train start:
critic loss [0.090243995, 0.031254135]
critic loss [0.085692436, 0.030410388]
critic loss [0.078295976, 0.029291447]
critic loss [0.069517359, 0.028067477]
critic loss [0.060366731, 0.026856035]
actor loss 38.4504
train end 0.6939878463745117
should be 1.0, predicted: [ 0.77374411], target predicted: [ 0.65275818]
should be 1.0, predicted: [ 0.36974496], target predicted: [ 0.25804356]
should be 1.0, predicted: [ 0.75428396], target predicted: [ 0.7461434]
should be 1.0, predict

critic loss [0.048017263, 0.024730928]
critic loss [0.042963427, 0.024825361]
critic loss [0.038157016, 0.02505219]
critic loss [0.034466643, 0.025478423]
actor loss 105.198
train end 0.5184123516082764
should be 1.0, predicted: [ 1.11057758], target predicted: [ 0.81312805]
should be -1.0, predicted: [-0.67741585], target predicted: [-0.68339491]
should be -1.0, predicted: [-0.93451291], target predicted: [-0.81194806]
should be 1.0, predicted: [ 0.99051291], target predicted: [ 0.68210584]
should be 1.0, predicted: [ 1.14790761], target predicted: [ 0.9360624]
should be 1.0, predicted: [ 0.85780752], target predicted: [ 0.66154212]
should be -1.0, predicted: [-1.06770182], target predicted: [-0.9950543]
start searching new action
get new action:  0.24502110481262207
start getting new_q
get new_q:  0.07323861122131348
train start:
critic loss [0.036510684, 0.025819277]
critic loss [0.035467058, 0.026386615]
critic loss [0.033908933, 0.026723217]
critic loss [0.032068245, 0.026794139]


critic loss [0.083438136, 0.034257606]
critic loss [0.073256336, 0.032889448]
critic loss [0.060986646, 0.031313747]
critic loss [0.049145311, 0.029820437]
actor loss 20.3179
train end 0.557934045791626
should be 1.0, predicted: [ 0.87375712], target predicted: [ 0.44388357]
should be 1.0, predicted: [ 1.02844739], target predicted: [ 0.66551441]
should be 1.0, predicted: [ 0.52539366], target predicted: [ 0.21698363]
should be 1.0, predicted: [ 0.83252198], target predicted: [ 0.78811628]
start searching new action
get new action:  0.23168396949768066
start getting new_q
get new_q:  0.06272482872009277
train start:
critic loss [0.039050154, 0.028781472]
critic loss [0.036855727, 0.028509704]
critic loss [0.034398966, 0.028133878]
critic loss [0.032123163, 0.027759511]
critic loss [0.030263022, 0.027432553]
actor loss 43.4932
train end 0.5634701251983643
should be -1.0, predicted: [-1.10625362], target predicted: [-1.1642592]
should be 1.0, predicted: [ 0.92353702], target predicted: [

get new action:  0.24664783477783203
start getting new_q
get new_q:  0.06994867324829102
train start:
critic loss [0.11497328, 0.03443335]
critic loss [0.10775396, 0.033076566]
critic loss [0.095978208, 0.031103719]
critic loss [0.081912793, 0.028886795]
critic loss [0.067606948, 0.026738342]
actor loss 29.9529
train end 0.8240921497344971
should be -1.0, predicted: [-0.7752822], target predicted: [-0.76671106]
should be 1.0, predicted: [ 1.48705447], target predicted: [ 1.38282251]
should be 1.0, predicted: [ 0.95472926], target predicted: [ 0.97166747]
should be 1.0, predicted: [ 0.25143975], target predicted: [-0.46090207]
start searching new action
get new action:  0.2579305171966553
start getting new_q
get new_q:  0.07062482833862305
train start:
critic loss [0.030262427, 0.030262427]
critic loss [0.029583665, 0.029583665]
critic loss [0.02875397, 0.02875397]
critic loss [0.027852107, 0.027852107]
critic loss [0.026956396, 0.026956396]
actor loss 14.0239
train end 0.82731914520263

critic loss [0.032147057, 0.032147057]
critic loss [0.032060437, 0.032060437]
actor loss -4.28151
train end 0.8547701835632324
start searching new action
get new action:  0.3045964241027832
start getting new_q
get new_q:  0.10247969627380371
train start:
critic loss [0.03414274, 0.027816948]
critic loss [0.032361373, 0.026901148]
critic loss [0.029553358, 0.025441758]
critic loss [0.026482295, 0.023704413]
critic loss [0.023678195, 0.021873932]
actor loss 54.1025
train end 0.6100592613220215
should be 1.0, predicted: [ 0.93648225], target predicted: [ 0.79516774]
should be 1.0, predicted: [ 1.07764041], target predicted: [ 0.84523338]
should be -1.0, predicted: [-1.17286074], target predicted: [-0.79466158]
should be 1.0, predicted: [ 1.0730319], target predicted: [ 0.91384166]
Episode : 366 Replay Buffer 37306
TOTAL REWARD @ 366-th Episode  : Reward -1
Total Step: 125
start searching new action
get new action:  0.242964506149292
start getting new_q
get new_q:  0.07117080688476562
trai

should be 1.0, predicted: [ 0.78671283], target predicted: [ 0.66242915]
should be 1.0, predicted: [ 0.63155562], target predicted: [ 0.69733363]
start searching new action
get new action:  0.25230836868286133
start getting new_q
get new_q:  0.06766104698181152
train start:
critic loss [0.026546143, 0.023394797]
critic loss [0.027025383, 0.02350701]
critic loss [0.02686305, 0.023353485]
critic loss [0.025987562, 0.022859529]
critic loss [0.024505083, 0.02203666]
actor loss 94.3468
train end 0.8922975063323975
should be 1.0, predicted: [ 1.01717556], target predicted: [ 1.17168403]
should be 1.0, predicted: [ 0.67035925], target predicted: [ 0.6067946]
start searching new action
get new action:  0.29099178314208984
start getting new_q
get new_q:  0.07393717765808105
train start:
critic loss [0.039137829, 0.02988895]
critic loss [0.036973558, 0.028723804]
critic loss [0.033683289, 0.0272457]
critic loss [0.029956661, 0.025627935]
critic loss [0.026475854, 0.024067882]
actor loss 14.3456


critic loss [0.026087962, 0.022945199]
actor loss 33.2299
train end 0.5540673732757568
should be -1.0, predicted: [-0.69255209], target predicted: [-0.53099525]
should be 1.0, predicted: [ 0.88827747], target predicted: [ 0.59554732]
start searching new action
get new action:  0.2507445812225342
start getting new_q
get new_q:  0.06941986083984375
train start:
critic loss [0.049454924, 0.036197204]
critic loss [0.047673769, 0.035614491]
critic loss [0.044824302, 0.034549288]
critic loss [0.04127527, 0.033083521]
critic loss [0.037419867, 0.031358756]
actor loss 44.0975
train end 0.5346627235412598
should be 1.0, predicted: [ 0.37987268], target predicted: [ 0.57165945]
should be -1.0, predicted: [-0.77421707], target predicted: [-0.27822307]
start searching new action
get new action:  0.23279190063476562
start getting new_q
get new_q:  0.06356978416442871
train start:
critic loss [0.031817123, 0.025141254]
critic loss [0.030241773, 0.024047896]
critic loss [0.027963553, 0.022840034]
cri

get new action:  0.23845219612121582
start getting new_q
get new_q:  0.06126594543457031
train start:
critic loss [0.017427526, 0.017427526]
critic loss [0.01763746, 0.01763746]
critic loss [0.017946867, 0.017946867]
critic loss [0.018250529, 0.018250529]
critic loss [0.018477319, 0.018477319]
actor loss 63.0217
train end 0.5400714874267578
Episode : 371 Replay Buffer 37680
TOTAL REWARD @ 371-th Episode  : Reward 1
Total Step: 106
start searching new action
get new action:  0.25759243965148926
start getting new_q
get new_q:  0.08930277824401855
train start:
critic loss [0.026905822, 0.026905822]
critic loss [0.026711131, 0.026711131]
critic loss [0.026172034, 0.026172034]
critic loss [0.025353804, 0.025353804]
critic loss [0.024337951, 0.024337951]
actor loss 40.3318
train end 0.6174962520599365
start searching new action
get new action:  0.233292818069458
start getting new_q
get new_q:  0.061300039291381836
train start:
critic loss [0.022703497, 0.022703497]
critic loss [0.021842722, 

get new action:  0.2310171127319336
start getting new_q
get new_q:  0.07711076736450195
train start:
critic loss [0.045017451, 0.02600201]
critic loss [0.045090627, 0.027078081]
critic loss [0.043502387, 0.027910884]
critic loss [0.040703319, 0.028324844]
critic loss [0.037333678, 0.028317176]
actor loss 100.838
train end 0.5489771366119385
should be 1.0, predicted: [ 0.75328392], target predicted: [ 0.6844672]
should be 1.0, predicted: [ 0.72773153], target predicted: [ 0.68174946]
Episode : 373 Replay Buffer 37942
TOTAL REWARD @ 373-th Episode  : Reward -1
Total Step: 130
start searching new action
get new action:  0.2862987518310547
start getting new_q
get new_q:  0.09573245048522949
train start:
critic loss [0.023590695, 0.022325424]
critic loss [0.023806529, 0.022429237]
critic loss [0.023476414, 0.022155024]
critic loss [0.022784879, 0.021659562]
critic loss [0.021857612, 0.02100946]
actor loss 63.7965
train end 0.8197972774505615
should be 1.0, predicted: [ 1.15376198], target p

get new action:  0.2300121784210205
start getting new_q
get new_q:  0.07060027122497559
train start:
critic loss [0.0379778, 0.025445646]
critic loss [0.036805615, 0.025071513]
critic loss [0.034168653, 0.024339996]
critic loss [0.030713232, 0.023387354]
critic loss [0.027165353, 0.022356864]
actor loss 73.2444
train end 0.5333278179168701
should be 1.0, predicted: [ 0.52484977], target predicted: [ 0.65334851]
should be 1.0, predicted: [ 0.94248641], target predicted: [ 0.95492375]
should be 1.0, predicted: [ 0.95840156], target predicted: [ 0.74390113]
start searching new action
get new action:  0.24397826194763184
start getting new_q
get new_q:  0.06513643264770508
train start:
critic loss [0.031000946, 0.027989002]
critic loss [0.030682947, 0.027850311]
critic loss [0.029970258, 0.027523059]
critic loss [0.028984034, 0.027034504]
critic loss [0.027851328, 0.026421631]
actor loss -22.9068
train end 0.5297298431396484
should be 1.0, predicted: [ 0.86277181], target predicted: [ 0.800

critic loss [0.022498284, 0.022498284]
critic loss [0.02220396, 0.02220396]
critic loss [0.021958955, 0.021958955]
critic loss [0.021687098, 0.021687098]
actor loss 93.0213
train end 0.7984402179718018
start searching new action
get new action:  0.25284767150878906
start getting new_q
get new_q:  0.06491565704345703
train start:
critic loss [0.029911947, 0.028056689]
critic loss [0.029106323, 0.027787855]
critic loss [0.028044915, 0.027315041]
critic loss [0.026988268, 0.026715422]
critic loss [0.026065229, 0.026029246]
actor loss 36.3786
train end 1.0442242622375488
should be -1.0, predicted: [-0.89918852], target predicted: [-1.13738704]
start searching new action
get new action:  0.2923293113708496
start getting new_q
get new_q:  0.06534194946289062
train start:
critic loss [0.039673232, 0.028370388]
critic loss [0.039134663, 0.027911816]
critic loss [0.037344001, 0.027093915]
critic loss [0.034677796, 0.026037006]
critic loss [0.031591836, 0.0248843]
actor loss 125.132
train end 0.

start searching new action
get new action:  0.24951601028442383
start getting new_q
get new_q:  0.06399869918823242
train start:
critic loss [0.035513632, 0.034239698]
critic loss [0.036407277, 0.034889512]
critic loss [0.037196182, 0.035396211]
critic loss [0.037653156, 0.035602443]
critic loss [0.037696701, 0.035461999]
actor loss 30.8669
train end 0.5712423324584961
should be -1.0, predicted: [-0.82775223], target predicted: [-0.88276845]
should be 1.0, predicted: [ 0.91919601], target predicted: [ 0.24134366]
should be 1.0, predicted: [ 1.06950259], target predicted: [ 0.9088921]
start searching new action
get new action:  0.2502262592315674
start getting new_q
get new_q:  0.06811213493347168
train start:
critic loss [0.046681289, 0.036546577]
critic loss [0.045602933, 0.036244877]
critic loss [0.043484963, 0.03533278]
critic loss [0.040638171, 0.033933692]
critic loss [0.037408143, 0.032205507]
actor loss 46.9607
train end 0.514582633972168
should be 1.0, predicted: [ 0.61795706],

actor loss 77.4053
train end 0.6453495025634766
trained action prob map predicted by initial model for a starting game
[[[ 0.29804265  0.01196739  0.01197902  0.01197557  0.01196062  0.01197073
    0.0119977 ]
  [ 0.01200517  0.01198487  0.01196103  0.01196113  0.01197187  0.01196856
    0.01197067]
  [ 0.01197807  0.01196492  0.01196668  0.01197252  0.01196329  0.01199411
    0.01198597]
  [ 0.0119667   0.01196212  0.01196482  0.01196538  0.01196075  0.01196657
    0.0119802 ]
  [ 0.01201795  0.01196345  0.01196103  0.01197475  0.01196252  0.01196169
    0.01196206]
  [ 0.01198796  0.01196761  0.01196251  0.01196794  0.01197948  0.0119908
    0.01197211]
  [ 0.01199026  0.01199466  0.01196321  0.01197917  0.01197379  0.01199038
    0.13916765]]

 [[ 0.00603394  0.06888052  0.01364662  0.00603321  0.00603295  0.00604385
    0.00603408]
  [ 0.05127079  0.04595738  0.08645003  0.00604354  0.00603659  0.00604218
    0.00603377]
  [ 0.04922133  0.13742565  0.06549785  0.00604759  0.0060459

get new action:  0.2325739860534668
start getting new_q
get new_q:  0.07470130920410156
train start:
critic loss [0.036428653, 0.031806536]
critic loss [0.03488278, 0.030929726]
critic loss [0.033274088, 0.029911552]
critic loss [0.031740107, 0.028850613]
critic loss [0.030320583, 0.027809422]
actor loss 42.3547
train end 0.6831588745117188
should be 1.0, predicted: [ 0.53695923], target predicted: [ 0.32071033]
should be -1.0, predicted: [-1.04262316], target predicted: [-0.8216418]
start searching new action
get new action:  0.25298070907592773
start getting new_q
get new_q:  0.06604123115539551
train start:
critic loss [0.065269478, 0.024567664]
critic loss [0.063725054, 0.024507707]
critic loss [0.058480427, 0.023900531]
critic loss [0.050531704, 0.022856094]
critic loss [0.041824099, 0.021623969]
actor loss 69.4249
train end 0.5514745712280273
should be 1.0, predicted: [ 0.19640709], target predicted: [ 0.07043883]
start searching new action
get new action:  0.2315366268157959
sta

critic loss [0.042602897, 0.035560649]
critic loss [0.040794238, 0.034716867]
critic loss [0.03835509, 0.033416808]
critic loss [0.035529621, 0.031791061]
actor loss 14.0705
train end 0.509091854095459
should be -1.0, predicted: [-0.952254], target predicted: [-0.60590953]
should be 1.0, predicted: [ 1.05264795], target predicted: [ 0.82137585]
should be -1.0, predicted: [-1.28325796], target predicted: [-1.16895795]
start searching new action
get new action:  0.2243177890777588
start getting new_q
get new_q:  0.07045745849609375
train start:
critic loss [0.02860596, 0.028526191]
critic loss [0.028146848, 0.028071489]
critic loss [0.027293481, 0.027245387]
critic loss [0.026153024, 0.026137136]
critic loss [0.024876187, 0.024876164]
actor loss 106.017
train end 0.5201010704040527
should be 1.0, predicted: [ 0.98438066], target predicted: [ 0.94596094]
start searching new action
get new action:  0.2386012077331543
start getting new_q
get new_q:  0.0639336109161377
train start:
critic lo

critic loss [0.073718838, 0.031213162]
critic loss [0.066993043, 0.02992611]
critic loss [0.059227437, 0.028667852]
critic loss [0.051355131, 0.027573118]
actor loss 43.2654
train end 0.5954794883728027
should be 1.0, predicted: [ 0.8324191], target predicted: [ 0.27073362]
should be -1.0, predicted: [-0.63252437], target predicted: [-0.87058473]
should be 1.0, predicted: [ 0.98431838], target predicted: [ 0.96098566]
should be 1.0, predicted: [ 0.50065809], target predicted: [-0.23829019]
start searching new action
get new action:  0.23344850540161133
start getting new_q
get new_q:  0.06603169441223145
train start:
critic loss [0.050765269, 0.035627104]
critic loss [0.050838273, 0.03462182]
critic loss [0.048960045, 0.033260729]
critic loss [0.045494437, 0.031658549]
critic loss [0.041090585, 0.02995868]
actor loss 65.3576
train end 0.5625946521759033
should be -1.0, predicted: [-0.500009], target predicted: [-0.54719508]
should be -1.0, predicted: [-0.85919678], target predicted: [-0

get new action:  0.28295373916625977
start getting new_q
get new_q:  0.09605860710144043
train start:
critic loss [0.046496727, 0.026302919]
critic loss [0.043267448, 0.025441553]
critic loss [0.038218588, 0.024274325]
critic loss [0.032585882, 0.023045454]
critic loss [0.027460586, 0.021965466]
actor loss 23.8149
train end 0.9468445777893066
should be 1.0, predicted: [ 1.06316197], target predicted: [ 0.59065396]
should be 1.0, predicted: [ 0.89083773], target predicted: [ 0.52899367]
should be 1.0, predicted: [ 0.52844769], target predicted: [ 0.52915621]
should be 1.0, predicted: [ 1.12862492], target predicted: [ 0.94131315]
Episode : 387 Replay Buffer 39383
TOTAL REWARD @ 387-th Episode  : Reward 1
Total Step: 122
start searching new action
get new action:  0.2345290184020996
start getting new_q
get new_q:  0.08115100860595703
train start:
critic loss [0.17652002, 0.044092797]
critic loss [0.169819, 0.044137359]
critic loss [0.15699151, 0.043398816]
critic loss [0.14018361, 0.0420

get new action:  0.22557616233825684
start getting new_q
get new_q:  0.07033061981201172
train start:
critic loss [0.070992649, 0.037043259]
critic loss [0.068914168, 0.03668924]
critic loss [0.064419135, 0.03635693]
critic loss [0.058576755, 0.035945341]
critic loss [0.052519307, 0.035422735]
actor loss 1.97948
train end 0.4912235736846924
should be 1.0, predicted: [ 1.12194192], target predicted: [ 1.03716838]
should be 1.0, predicted: [ 1.06656647], target predicted: [ 0.85121995]
should be 1.0, predicted: [ 1.06270778], target predicted: [ 0.86657941]
should be 1.0, predicted: [ 0.53498214], target predicted: [ 0.15869088]
should be 1.0, predicted: [ 0.6823656], target predicted: [ 0.53253996]
start searching new action
get new action:  0.24870848655700684
start getting new_q
get new_q:  0.07024359703063965
train start:
critic loss [0.036816448, 0.02871329]
critic loss [0.035300907, 0.028653298]
critic loss [0.03279981, 0.028093245]
critic loss [0.029912701, 0.027152322]
critic los

critic loss [0.054187339, 0.02857678]
actor loss 29.5606
train end 0.6426265239715576
should be -1.0, predicted: [-0.8141917], target predicted: [-0.64367449]
should be 1.0, predicted: [ 0.86404443], target predicted: [ 0.87530524]
should be -1.0, predicted: [-0.37066117], target predicted: [ 0.05567354]
should be -1.0, predicted: [-0.86677504], target predicted: [-0.92562872]
start searching new action
get new action:  0.2315688133239746
start getting new_q
get new_q:  0.07056140899658203
train start:
critic loss [0.13451517, 0.034870289]
critic loss [0.12992476, 0.034247667]
critic loss [0.11630225, 0.032376692]
critic loss [0.096637785, 0.029592797]
critic loss [0.075354807, 0.026540738]
actor loss 45.2542
train end 0.9202773571014404
should be 1.0, predicted: [ 0.57792091], target predicted: [ 0.44036651]
should be 1.0, predicted: [ 0.95078427], target predicted: [ 0.83276784]
should be 1.0, predicted: [ 0.14760461], target predicted: [ 0.33614689]
should be -1.0, predicted: [-0.88

critic loss [0.052677274, 0.029768787]
critic loss [0.048101466, 0.028681481]
critic loss [0.042498104, 0.027431736]
critic loss [0.036873765, 0.026190644]
actor loss 90.2758
train end 0.5870845317840576
should be 1.0, predicted: [ 0.70053023], target predicted: [ 0.39591032]
should be 1.0, predicted: [ 1.0540725], target predicted: [ 0.93672568]
should be 1.0, predicted: [ 0.79400563], target predicted: [ 0.48902407]
should be 1.0, predicted: [ 0.89234298], target predicted: [ 0.84897298]
start searching new action
get new action:  0.25800299644470215
start getting new_q
get new_q:  0.06493353843688965
train start:
critic loss [0.047003131, 0.027222756]
critic loss [0.043842956, 0.02604706]
critic loss [0.040147576, 0.024922686]
critic loss [0.036362454, 0.02395469]
critic loss [0.032914113, 0.023213753]
actor loss 49.7909
train end 0.5491628646850586
should be 1.0, predicted: [ 0.9803465], target predicted: [ 0.89800829]
should be 1.0, predicted: [ 0.50892824], target predicted: [ 0.

get new action:  0.2455286979675293
start getting new_q
get new_q:  0.07381057739257812
train start:
critic loss [0.028169017, 0.025172653]
critic loss [0.028116276, 0.025601996]
critic loss [0.027647732, 0.025722733]
critic loss [0.026840046, 0.025523461]
critic loss [0.025796341, 0.025023762]
actor loss 68.9199
train end 0.6129360198974609
should be -1.0, predicted: [-0.92164385], target predicted: [-1.02654648]
should be 1.0, predicted: [ 0.94396526], target predicted: [ 0.40795228]
should be 1.0, predicted: [ 0.95222348], target predicted: [ 0.82518119]
should be 1.0, predicted: [ 0.96209574], target predicted: [ 0.72706753]
Episode : 394 Replay Buffer 40256
TOTAL REWARD @ 394-th Episode  : Reward -1
Total Step: 105
start searching new action
get new action:  0.2475111484527588
start getting new_q
get new_q:  0.06927609443664551
train start:
critic loss [0.028775714, 0.027501525]
critic loss [0.028111855, 0.026927615]
critic loss [0.027135335, 0.026066985]
critic loss [0.025966002,

critic loss [0.033740561, 0.028641153]
actor loss 84.435
train end 0.50211501121521
should be 1.0, predicted: [ 1.28912854], target predicted: [ 1.15065598]
should be 1.0, predicted: [ 1.10940337], target predicted: [ 0.68640816]
start searching new action
get new action:  0.23952817916870117
start getting new_q
get new_q:  0.06240725517272949
train start:
critic loss [0.068943247, 0.029605836]
critic loss [0.065464064, 0.029500186]
critic loss [0.060057215, 0.029384132]
critic loss [0.053553917, 0.029233029]
critic loss [0.046786055, 0.029035833]
actor loss 66.4661
train end 0.511528491973877
should be -1.0, predicted: [-0.17033316], target predicted: [-0.62490141]
should be 1.0, predicted: [ 0.87597871], target predicted: [ 0.57768226]
should be 1.0, predicted: [ 1.27492392], target predicted: [ 0.98940718]
should be 1.0, predicted: [ 0.99279672], target predicted: [ 0.03694845]
start searching new action
get new action:  0.2569406032562256
start getting new_q
get new_q:  0.073300361

critic loss [0.038098082, 0.030440971]
critic loss [0.035228938, 0.029548613]
actor loss 71.8925
train end 0.6685113906860352
should be 1.0, predicted: [ 0.93339986], target predicted: [ 1.090446]
should be 1.0, predicted: [ 0.68702459], target predicted: [ 0.68341208]
should be 1.0, predicted: [ 0.82698286], target predicted: [ 0.76137137]
should be 1.0, predicted: [ 0.97912896], target predicted: [ 0.64522696]
start searching new action
get new action:  0.2401740550994873
start getting new_q
get new_q:  0.06544113159179688
train start:
critic loss [0.027113307, 0.021682806]
critic loss [0.026320968, 0.021604061]
critic loss [0.025429096, 0.021479018]
critic loss [0.024561591, 0.021315407]
critic loss [0.023745816, 0.021107279]
actor loss 97.8909
train end 0.5975854396820068
should be 1.0, predicted: [ 1.13479698], target predicted: [ 1.00776577]
should be 1.0, predicted: [ 0.99101555], target predicted: [ 0.94609421]
should be 1.0, predicted: [ 0.8106913], target predicted: [ 0.55066

get new action:  0.23519563674926758
start getting new_q
get new_q:  0.06278848648071289
train start:
critic loss [0.046923101, 0.02782909]
critic loss [0.044484444, 0.027138965]
critic loss [0.040217973, 0.025791401]
critic loss [0.035084382, 0.02411923]
critic loss [0.030082287, 0.022446547]
actor loss 27.2825
train end 0.5971846580505371
should be -1.0, predicted: [-0.51786733], target predicted: [-0.32528532]
should be 1.0, predicted: [ 1.06769586], target predicted: [ 0.96285748]
should be 1.0, predicted: [ 0.91733915], target predicted: [ 0.87183577]
start searching new action
get new action:  0.23847675323486328
start getting new_q
get new_q:  0.07426762580871582
train start:
critic loss [0.044488937, 0.044199459]
critic loss [0.042983137, 0.042610161]
critic loss [0.041187294, 0.040792152]
critic loss [0.039167374, 0.038814478]
critic loss [0.037011281, 0.03674829]
actor loss -32.3363
train end 0.587083101272583
should be 1.0, predicted: [ 0.88891387], target predicted: [ 0.771

get new action:  0.24050307273864746
start getting new_q
get new_q:  0.07219648361206055
train start:
critic loss [0.074808389, 0.032426193]
critic loss [0.067120567, 0.029376462]
critic loss [0.056796126, 0.026011519]
critic loss [0.045831092, 0.022917934]
critic loss [0.036055848, 0.020539925]
actor loss 1.59534
train end 0.801485538482666
should be 1.0, predicted: [ 0.99249041], target predicted: [ 0.80681026]
should be 1.0, predicted: [ 1.00363219], target predicted: [ 0.86245799]
should be -1.0, predicted: [-0.68490803], target predicted: [-0.3735005]
should be 1.0, predicted: [ 0.58691609], target predicted: [ 0.4552711]
start searching new action
get new action:  0.245391845703125
start getting new_q
get new_q:  0.0706641674041748
train start:
critic loss [0.03075191, 0.028814888]
critic loss [0.030292863, 0.029108725]
critic loss [0.030112602, 0.02947427]
critic loss [0.029940013, 0.0296489]
critic loss [0.029571032, 0.029467437]
actor loss -25.6954
train end 0.7103447914123535

get new action:  0.24166345596313477
start getting new_q
get new_q:  0.06263184547424316
train start:
critic loss [0.023316989, 0.016915545]
critic loss [0.02283605, 0.016632184]
critic loss [0.021546297, 0.01619046]
critic loss [0.019755818, 0.015650475]
critic loss [0.017827138, 0.015085777]
actor loss 78.8981
train end 0.6027476787567139
should be 1.0, predicted: [ 1.14241636], target predicted: [ 1.01053882]
should be 1.0, predicted: [ 1.04490805], target predicted: [ 0.92841178]
start searching new action
get new action:  0.2578122615814209
start getting new_q
get new_q:  0.07219505310058594
train start:
critic loss [0.048155613, 0.022033073]
critic loss [0.04513365, 0.021771435]
critic loss [0.040652167, 0.021261234]
critic loss [0.035528503, 0.020662103]
critic loss [0.030538861, 0.020112388]
actor loss 48.6778
train end 0.5370395183563232
should be 1.0, predicted: [ 0.49146199], target predicted: [ 0.34766898]
Episode : 403 Replay Buffer 41474
TOTAL REWARD @ 403-th Episode  : R

should be 1.0, predicted: [ 0.61051172], target predicted: [ 0.65703446]
start searching new action
get new action:  0.2540903091430664
start getting new_q
get new_q:  0.0747075080871582
train start:
critic loss [0.040754683, 0.020033525]
critic loss [0.038214985, 0.019777983]
critic loss [0.03406933, 0.019218931]
critic loss [0.02935667, 0.018557299]
critic loss [0.024869964, 0.017959934]
actor loss 77.2325
train end 0.6179032325744629
should be 1.0, predicted: [ 0.63037205], target predicted: [ 0.5868451]
should be 1.0, predicted: [ 0.78074217], target predicted: [ 0.75194782]
start searching new action
get new action:  0.2442305088043213
start getting new_q
get new_q:  0.06971144676208496
train start:
critic loss [0.059973922, 0.025113713]
critic loss [0.057314083, 0.025122225]
critic loss [0.052660927, 0.024770819]
critic loss [0.046914496, 0.024154231]
critic loss [0.040613674, 0.023348346]
actor loss 59.0835
train end 0.6684656143188477
should be 1.0, predicted: [ 0.7950263], tar

critic loss [0.025916666, 0.024313267]
actor loss 18.9325
train end 0.5895776748657227
should be 1.0, predicted: [ 0.89385206], target predicted: [ 0.84510034]
should be 1.0, predicted: [ 0.8889007], target predicted: [ 0.82333338]
start searching new action
get new action:  0.22980928421020508
start getting new_q
get new_q:  0.0792994499206543
train start:
critic loss [0.02142393, 0.018439569]
critic loss [0.021858346, 0.018469023]
critic loss [0.021685712, 0.018206574]
critic loss [0.020936651, 0.017659331]
critic loss [0.019732654, 0.016901024]
actor loss 44.8239
train end 0.5719890594482422
should be 1.0, predicted: [ 0.79597563], target predicted: [ 0.57207799]
start searching new action
get new action:  0.24229764938354492
start getting new_q
get new_q:  0.06800436973571777
train start:
critic loss [0.11819644, 0.028750747]
critic loss [0.11087789, 0.027621502]
critic loss [0.097700506, 0.025762904]
critic loss [0.081282102, 0.023615148]
critic loss [0.064255551, 0.021565143]
act

get new action:  0.2689995765686035
start getting new_q
get new_q:  0.08719062805175781
train start:
critic loss [0.022556588, 0.018323062]
critic loss [0.022820391, 0.018313052]
critic loss [0.022679714, 0.018210337]
critic loss [0.022189988, 0.018010695]
critic loss [0.021425676, 0.017729068]
actor loss 12.4218
train end 0.5979185104370117
should be -1.0, predicted: [-0.87495887], target predicted: [-0.7138536]
should be 1.0, predicted: [ 0.74121374], target predicted: [ 0.87461209]
should be 1.0, predicted: [ 0.9424895], target predicted: [ 0.83374089]
start searching new action
get new action:  0.2617664337158203
start getting new_q
get new_q:  0.07973170280456543
train start:
critic loss [0.044844344, 0.026106404]
critic loss [0.043767687, 0.026025776]
critic loss [0.041602507, 0.025763247]
critic loss [0.038771875, 0.025363581]
critic loss [0.035709262, 0.02487272]
actor loss 56.8536
train end 0.7534539699554443
should be -1.0, predicted: [-1.02719355], target predicted: [-0.9322

Episode : 410 Replay Buffer 42276
TOTAL REWARD @ 410-th Episode  : Reward 1
Total Step: 42
start searching new action
get new action:  0.2521798610687256
start getting new_q
get new_q:  0.06897783279418945
train start:
critic loss [0.035487361, 0.027145531]
critic loss [0.034664117, 0.026884338]
critic loss [0.033012073, 0.026283247]
critic loss [0.030841881, 0.025465775]
critic loss [0.028471287, 0.024540082]
actor loss 86.7151
train end 0.552666425704956
should be 1.0, predicted: [ 1.01321852], target predicted: [ 0.59534717]
start searching new action
get new action:  0.24474072456359863
start getting new_q
get new_q:  0.06798028945922852
train start:
critic loss [0.04010075, 0.026035946]
critic loss [0.037414007, 0.025196765]
critic loss [0.034130916, 0.024280408]
critic loss [0.030843267, 0.023466311]
critic loss [0.027988892, 0.022861596]
actor loss 60.4162
train end 0.6228163242340088
should be 1.0, predicted: [ 1.11315882], target predicted: [ 0.5572958]
should be 1.0, predicte

critic loss [0.032611098, 0.030991562]
actor loss 43.4339
train end 0.5568182468414307
should be 1.0, predicted: [ 0.86888295], target predicted: [ 0.66287309]
Episode : 412 Replay Buffer 42521
TOTAL REWARD @ 412-th Episode  : Reward 1
Total Step: 108
start searching new action
get new action:  0.2600398063659668
start getting new_q
get new_q:  0.07277488708496094
train start:
critic loss [0.023762815, 0.023551006]
critic loss [0.023120817, 0.022914182]
critic loss [0.022290608, 0.022106679]
critic loss [0.021356396, 0.021206712]
critic loss [0.020408146, 0.020297419]
actor loss 28.219
train end 0.6161210536956787
should be 1.0, predicted: [ 0.9434616], target predicted: [ 0.91131139]
start searching new action
get new action:  0.23508262634277344
start getting new_q
get new_q:  0.06332802772521973
train start:
critic loss [0.080047764, 0.030912742]
critic loss [0.069617115, 0.029610738]
critic loss [0.055044342, 0.027772166]
critic loss [0.040958729, 0.025980417]
critic loss [0.034672

critic loss [0.023587663, 0.021275878]
actor loss 17.5221
train end 0.61962890625
should be 1.0, predicted: [ 1.04230618], target predicted: [ 0.22220935]
should be -1.0, predicted: [-0.95792657], target predicted: [-1.16509235]
should be 1.0, predicted: [ 1.1765362], target predicted: [ 0.91844428]
start searching new action
get new action:  0.23981809616088867
start getting new_q
get new_q:  0.06785750389099121
train start:
critic loss [0.023454458, 0.020695299]
critic loss [0.022881411, 0.020409541]
critic loss [0.022155095, 0.020079594]
critic loss [0.021395953, 0.019732669]
critic loss [0.020626277, 0.019384265]
actor loss 67.9516
train end 0.6145439147949219
should be 1.0, predicted: [ 1.24057233], target predicted: [ 0.28231698]
Episode : 414 Replay Buffer 42779
TOTAL REWARD @ 414-th Episode  : Reward 1
Total Step: 101
start searching new action
get new action:  0.24887466430664062
start getting new_q
get new_q:  0.06795072555541992
train start:
critic loss [0.16560234, 0.045330

get new action:  0.24588465690612793
start getting new_q
get new_q:  0.07114720344543457
train start:
critic loss [0.033719607, 0.022350594]
critic loss [0.032403596, 0.02271645]
critic loss [0.030249961, 0.022783253]
critic loss [0.027718833, 0.022590861]
critic loss [0.025220722, 0.022192888]
actor loss 14.7158
train end 0.64882493019104
should be 1.0, predicted: [ 0.8644523], target predicted: [ 0.59603965]
should be 1.0, predicted: [ 0.77132159], target predicted: [ 0.61242437]
start searching new action
get new action:  0.271575927734375
start getting new_q
get new_q:  0.07925534248352051
train start:
critic loss [0.03552755, 0.030855278]
critic loss [0.035445087, 0.030833535]
critic loss [0.034526348, 0.030377133]
critic loss [0.03306647, 0.02963078]
critic loss [0.031384759, 0.028735671]
actor loss 49.2024
train end 0.8050551414489746
should be -1.0, predicted: [-0.76579446], target predicted: [-0.81399769]
should be 1.0, predicted: [ 0.93433332], target predicted: [ 0.96731293]

get new action:  0.2508354187011719
start getting new_q
get new_q:  0.07497763633728027
train start:
critic loss [0.025357872, 0.018100269]
critic loss [0.023866853, 0.01735086]
critic loss [0.021916509, 0.016571537]
critic loss [0.019804679, 0.015825231]
critic loss [0.017789168, 0.015154053]
actor loss 42.863
train end 0.5201601982116699
should be 1.0, predicted: [ 0.48832762], target predicted: [ 0.90306437]
start searching new action
get new action:  0.2326047420501709
start getting new_q
get new_q:  0.07475566864013672
train start:
critic loss [0.027644146, 0.023511369]
critic loss [0.026311157, 0.022949241]
critic loss [0.02478634, 0.022329384]
critic loss [0.023356713, 0.021717802]
critic loss [0.022229414, 0.021174889]
actor loss 58.3351
train end 0.801959753036499
should be 1.0, predicted: [ 1.07988226], target predicted: [ 1.08155036]
should be -1.0, predicted: [-0.95255178], target predicted: [-0.87204808]
should be -1.0, predicted: [-1.08313906], target predicted: [-1.08446

critic loss [0.059069071, 0.040766422]
critic loss [0.060541853, 0.043276872]
critic loss [0.058995958, 0.043854915]
critic loss [0.054962043, 0.042708397]
actor loss -7.30376
train end 0.6248581409454346
should be 1.0, predicted: [ 0.36139587], target predicted: [ 0.45055452]
should be 1.0, predicted: [ 0.98919022], target predicted: [ 0.88774574]
start searching new action
get new action:  0.23919677734375
start getting new_q
get new_q:  0.07177901268005371
train start:
critic loss [0.046239074, 0.039425887]
critic loss [0.042216096, 0.036943935]
critic loss [0.037659019, 0.034251742]
critic loss [0.033851713, 0.031947192]
critic loss [0.031457998, 0.030272884]
actor loss 52.671
train end 0.6437482833862305
should be 1.0, predicted: [ 0.80312163], target predicted: [ 0.86689669]
should be -1.0, predicted: [-1.07520115], target predicted: [-0.98468906]
start searching new action
get new action:  0.2492358684539795
start getting new_q
get new_q:  0.06788516044616699
train start:
critic

Episode : 421 Replay Buffer 43576
TOTAL REWARD @ 421-th Episode  : Reward -1
Total Step: 142
start searching new action
get new action:  0.24213886260986328
start getting new_q
get new_q:  0.07428646087646484
train start:
critic loss [0.02815398, 0.027062155]
critic loss [0.028050985, 0.026939455]
critic loss [0.027310416, 0.026242975]
critic loss [0.026115632, 0.025151376]
critic loss [0.024668641, 0.023852788]
actor loss 53.269
train end 0.5270380973815918
should be 1.0, predicted: [ 0.72340006], target predicted: [ 0.60075921]
start searching new action
get new action:  0.23366951942443848
start getting new_q
get new_q:  0.0939323902130127
train start:
critic loss [0.049200833, 0.02696329]
critic loss [0.046019968, 0.025808612]
critic loss [0.0416722, 0.02450287]
critic loss [0.036921479, 0.0232477]
critic loss [0.032451443, 0.022195123]
actor loss 55.8183
train end 0.5994181632995605
should be -1.0, predicted: [-1.31728804], target predicted: [-1.46490049]
should be -1.0, predicted

critic loss [0.033947468, 0.026469998]
critic loss [0.032436531, 0.025713066]
critic loss [0.030398842, 0.024690127]
critic loss [0.028111706, 0.023522042]
actor loss 50.608
train end 0.5183160305023193
should be 1.0, predicted: [ 0.91844243], target predicted: [ 0.82331806]
should be 1.0, predicted: [ 0.76378775], target predicted: [ 0.67394048]
should be -1.0, predicted: [-0.78290713], target predicted: [-0.73612374]
should be 1.0, predicted: [ 0.81062573], target predicted: [ 0.63766688]
start searching new action
get new action:  0.24313831329345703
start getting new_q
get new_q:  0.06524038314819336
train start:
critic loss [0.057657346, 0.019008657]
critic loss [0.054254621, 0.018297952]
critic loss [0.048819892, 0.017528882]
critic loss [0.042435344, 0.016868941]
critic loss [0.03589448, 0.016417038]
actor loss 81.6882
train end 0.6122934818267822
should be -1.0, predicted: [-1.2020787], target predicted: [-1.18144119]
should be 1.0, predicted: [ 0.59162247], target predicted: [

get new action:  0.23723769187927246
start getting new_q
get new_q:  0.06161689758300781
train start:
critic loss [0.022772063, 0.01875475]
critic loss [0.023625597, 0.019299183]
critic loss [0.023533586, 0.019470576]
critic loss [0.022581752, 0.019245826]
critic loss [0.021046937, 0.018685168]
actor loss 64.0518
train end 0.5578396320343018
should be -1.0, predicted: [-1.05893636], target predicted: [-1.04727483]
should be 1.0, predicted: [ 1.03024554], target predicted: [ 0.92380708]
start searching new action
get new action:  0.2457423210144043
start getting new_q
get new_q:  0.07106161117553711
train start:
critic loss [0.023126177, 0.022775568]
critic loss [0.022060614, 0.021905936]
critic loss [0.021055307, 0.021027409]
critic loss [0.020214507, 0.020210356]
critic loss [0.019560881, 0.019481547]
actor loss 6.04576
train end 1.0235075950622559
should be -1.0, predicted: [-0.76993084], target predicted: [-0.74025309]
start searching new action
get new action:  0.24373078346252441


get new action:  0.26432180404663086
start getting new_q
get new_q:  0.10440254211425781
train start:
critic loss [0.039221279, 0.020965474]
critic loss [0.036902618, 0.019725127]
critic loss [0.033692174, 0.018622903]
critic loss [0.030172162, 0.017676391]
critic loss [0.026693892, 0.016868344]
actor loss 32.0842
train end 0.5734121799468994
should be 1.0, predicted: [ 0.91943294], target predicted: [ 0.68795252]
should be 1.0, predicted: [ 0.57060885], target predicted: [ 0.42639443]
should be 1.0, predicted: [ 1.04190993], target predicted: [ 0.75383592]
start searching new action
get new action:  0.25521159172058105
start getting new_q
get new_q:  0.07027935981750488
train start:
critic loss [0.057060163, 0.040519539]
critic loss [0.056338083, 0.040424444]
critic loss [0.054236077, 0.039843895]
critic loss [0.051105969, 0.038861923]
critic loss [0.047425263, 0.037605669]
actor loss -6.03844
train end 0.5276167392730713
should be -1.0, predicted: [-0.64290255], target predicted: [-0

get new action:  0.23753929138183594
start getting new_q
get new_q:  0.07897806167602539
train start:
critic loss [0.030064519, 0.029729851]
critic loss [0.029170079, 0.028937649]
critic loss [0.027654616, 0.027552886]
critic loss [0.025799088, 0.025786953]
critic loss [0.023868162, 0.023853343]
actor loss -6.52831
train end 0.6155400276184082
should be 1.0, predicted: [ 0.63234609], target predicted: [ 0.62883645]
start searching new action
get new action:  0.23626399040222168
start getting new_q
get new_q:  0.06297850608825684
train start:
critic loss [0.034752734, 0.018034581]
critic loss [0.035034299, 0.017847672]
critic loss [0.033983734, 0.017629456]
critic loss [0.031818617, 0.017310614]
critic loss [0.028888837, 0.016892236]
actor loss 25.4914
train end 0.5630159378051758
should be 1.0, predicted: [ 0.49097186], target predicted: [ 0.71753371]
should be -1.0, predicted: [-1.11324215], target predicted: [-1.00416064]
should be 1.0, predicted: [ 0.74831027], target predicted: [ 0

Episode : 430 Replay Buffer 44667
TOTAL REWARD @ 430-th Episode  : Reward 1
Total Step: 124
start searching new action
get new action:  0.2756078243255615
start getting new_q
get new_q:  0.07140922546386719
train start:
critic loss [0.026372671, 0.023491714]
critic loss [0.026568154, 0.023123514]
critic loss [0.026307933, 0.022611696]
critic loss [0.025586803, 0.0219646]
critic loss [0.024482878, 0.021220556]
actor loss 63.6964
train end 0.6556696891784668
should be 1.0, predicted: [ 0.73103327], target predicted: [ 0.70345533]
start searching new action
get new action:  0.2432231903076172
start getting new_q
get new_q:  0.06726741790771484
train start:
critic loss [0.06185272, 0.028985493]
critic loss [0.056924887, 0.027718699]
critic loss [0.049255274, 0.025827171]
critic loss [0.040819362, 0.023806822]
critic loss [0.033573806, 0.022132318]
actor loss 109.394
train end 0.5618500709533691
should be 1.0, predicted: [ 0.64975935], target predicted: [ 0.47019392]
should be 1.0, predicte

get new action:  0.23870301246643066
start getting new_q
get new_q:  0.06596612930297852
train start:
critic loss [0.029697098, 0.021968249]
critic loss [0.028769502, 0.021413319]
critic loss [0.027339058, 0.020744115]
critic loss [0.025555223, 0.020000672]
critic loss [0.023609802, 0.019235397]
actor loss 13.5159
train end 0.5887823104858398
should be -1.0, predicted: [-0.94517529], target predicted: [-0.92268103]
should be 1.0, predicted: [ 0.94075036], target predicted: [ 1.00818741]
Episode : 432 Replay Buffer 44919
TOTAL REWARD @ 432-th Episode  : Reward -1
Total Step: 139
start searching new action
get new action:  0.2522289752960205
start getting new_q
get new_q:  0.06965208053588867
train start:
critic loss [0.037572276, 0.023718508]
critic loss [0.03613288, 0.023602314]
critic loss [0.034150813, 0.023518153]
critic loss [0.031830352, 0.023426197]
critic loss [0.029421732, 0.023281541]
actor loss 42.4581
train end 0.6332123279571533
should be 1.0, predicted: [ 1.00081813], targ

critic loss [0.034431744, 0.01952241]
critic loss [0.027826643, 0.018160664]
critic loss [0.022891663, 0.017527444]
critic loss [0.01989061, 0.017529018]
actor loss 49.5508
train end 0.5608036518096924
should be -1.0, predicted: [-0.93753588], target predicted: [-0.9493413]
should be 1.0, predicted: [ 1.14407408], target predicted: [ 0.72139478]
should be -1.0, predicted: [-1.01675534], target predicted: [-0.86081773]
start searching new action
get new action:  0.2295699119567871
start getting new_q
get new_q:  0.07263922691345215
train start:
critic loss [0.043793887, 0.038040072]
critic loss [0.045853086, 0.03925886]
critic loss [0.04639703, 0.039580312]
critic loss [0.045400396, 0.038985521]
critic loss [0.04311844, 0.037632421]
actor loss 7.23686
train end 0.5940277576446533
should be 1.0, predicted: [ 0.51889092], target predicted: [ 0.67090696]
Episode : 434 Replay Buffer 45146
TOTAL REWARD @ 434-th Episode  : Reward -1
Total Step: 102
start searching new action
get new action:  

critic loss [0.084473006, 0.030987963]
actor loss 37.9824
train end 0.5224490165710449
should be 1.0, predicted: [ 0.95708114], target predicted: [ 0.82377923]
should be 1.0, predicted: [ 0.58742845], target predicted: [ 0.33550858]
should be 1.0, predicted: [ 0.17612807], target predicted: [-0.56528497]
should be 1.0, predicted: [ 0.79515123], target predicted: [ 0.75059092]
start searching new action
get new action:  0.2401871681213379
start getting new_q
get new_q:  0.07577633857727051
train start:
critic loss [0.042053171, 0.022186011]
critic loss [0.042159479, 0.023184255]
critic loss [0.041870501, 0.024006683]
critic loss [0.040948652, 0.024540968]
critic loss [0.039282866, 0.024722353]
actor loss 50.1064
train end 0.5396981239318848
should be 1.0, predicted: [ 0.82079971], target predicted: [ 0.45112014]
should be -1.0, predicted: [-0.56034273], target predicted: [-0.88398486]
should be 1.0, predicted: [ 0.92508596], target predicted: [ 0.93250483]
should be 1.0, predicted: [ 1.

get new action:  0.2433171272277832
start getting new_q
get new_q:  0.06244039535522461
train start:
critic loss [0.095595747, 0.032819472]
critic loss [0.090837084, 0.032287683]
critic loss [0.083747953, 0.031556569]
critic loss [0.075332426, 0.030786045]
critic loss [0.066542432, 0.030098923]
actor loss 62.3249
train end 0.5098848342895508
should be 1.0, predicted: [ 0.66266465], target predicted: [ 0.61705667]
should be 1.0, predicted: [-0.10804711], target predicted: [-0.10098317]
should be 1.0, predicted: [ 0.97485256], target predicted: [ 1.25306213]
start searching new action
get new action:  0.23248553276062012
start getting new_q
get new_q:  0.06370091438293457
train start:
critic loss [0.10101247, 0.026514916]
critic loss [0.093864188, 0.027280668]
critic loss [0.084407665, 0.028000152]
critic loss [0.074047536, 0.028730975]
critic loss [0.063997269, 0.029470962]
actor loss 53.808
train end 0.5048222541809082
should be 1.0, predicted: [ 0.7006523], target predicted: [ 0.78425

get new action:  0.23771142959594727
start getting new_q
get new_q:  0.0646512508392334
train start:
critic loss [0.040110461, 0.025287103]
critic loss [0.039097536, 0.025060046]
critic loss [0.036932215, 0.024538422]
critic loss [0.034024492, 0.023806404]
critic loss [0.031098668, 0.022998489]
actor loss 34.0027
train end 0.5392777919769287
should be 1.0, predicted: [ 1.15646863], target predicted: [ 1.26639402]
should be 1.0, predicted: [ 0.61267394], target predicted: [ 0.43820801]
should be 1.0, predicted: [ 0.91227651], target predicted: [ 0.87096065]
start searching new action
get new action:  0.23801493644714355
start getting new_q
get new_q:  0.06628584861755371
train start:
critic loss [0.024747817, 0.016272079]
critic loss [0.023994604, 0.016164806]
critic loss [0.022708669, 0.015960813]
critic loss [0.021109417, 0.015680265]
critic loss [0.019434033, 0.01535424]
actor loss 75.4433
train end 0.5401778221130371
should be 1.0, predicted: [ 1.10986614], target predicted: [ 1.215

critic loss [0.024274051, 0.018231412]
critic loss [0.023876499, 0.018146627]
critic loss [0.022940308, 0.017903652]
critic loss [0.021645429, 0.017548438]
critic loss [0.020195475, 0.017136188]
actor loss 71.1042
train end 0.831322431564331
should be 1.0, predicted: [ 0.87459195], target predicted: [ 0.71370804]
should be 1.0, predicted: [ 0.78485978], target predicted: [ 0.7581923]
start searching new action
get new action:  0.23951292037963867
start getting new_q
get new_q:  0.06871318817138672
train start:
critic loss [0.021518391, 0.019477965]
critic loss [0.021269176, 0.019419089]
critic loss [0.020899985, 0.019310351]
critic loss [0.020429332, 0.019137025]
critic loss [0.019878836, 0.01888909]
actor loss 75.104
train end 1.0252833366394043
should be 1.0, predicted: [ 0.65128136], target predicted: [ 0.70362455]
start searching new action
get new action:  0.25196099281311035
start getting new_q
get new_q:  0.0633401870727539
train start:
critic loss [0.031215306, 0.031025087]
cri

get new action:  0.26726412773132324
start getting new_q
get new_q:  0.07925677299499512
train start:
critic loss [0.030654643, 0.028426886]
critic loss [0.028938897, 0.027084177]
critic loss [0.026751904, 0.025432611]
critic loss [0.024470765, 0.023708237]
critic loss [0.022409037, 0.022096422]
actor loss 62.1053
train end 0.7727963924407959
should be 1.0, predicted: [ 0.89377552], target predicted: [ 0.95880103]
start searching new action
get new action:  0.24653363227844238
start getting new_q
get new_q:  0.0720362663269043
train start:
critic loss [0.03216619, 0.029831514]
critic loss [0.032092758, 0.029401336]
critic loss [0.031801015, 0.028926862]
critic loss [0.031212006, 0.028341388]
critic loss [0.03031238, 0.027618108]
actor loss 29.8066
train end 0.681748628616333
should be 1.0, predicted: [ 0.77670926], target predicted: [ 0.7695291]
should be 1.0, predicted: [ 0.65643781], target predicted: [ 0.22120769]
Episode : 443 Replay Buffer 46292
TOTAL REWARD @ 443-th Episode  : Re

get new action:  0.23829150199890137
start getting new_q
get new_q:  0.06304693222045898
train start:
critic loss [0.13938539, 0.037051298]
critic loss [0.13026941, 0.035608418]
critic loss [0.1146051, 0.033370785]
critic loss [0.095209554, 0.030823052]
critic loss [0.076233901, 0.028569158]
actor loss 52.7375
train end 0.5492324829101562
should be -1.0, predicted: [-0.68454367], target predicted: [-0.25182366]
should be -1.0, predicted: [-0.1368257], target predicted: [ 0.26066512]
start searching new action
get new action:  0.2317054271697998
start getting new_q
get new_q:  0.0646815299987793
train start:
critic loss [0.023716919, 0.017787183]
critic loss [0.024189472, 0.018530488]
critic loss [0.024559401, 0.019322488]
critic loss [0.024658386, 0.019961022]
critic loss [0.024288002, 0.020282865]
actor loss 27.929
train end 0.5302808284759521
should be -1.0, predicted: [-0.78456092], target predicted: [-0.56782633]
should be 1.0, predicted: [ 0.73801148], target predicted: [ 0.809171

get new action:  0.22566652297973633
start getting new_q
get new_q:  0.07007646560668945
train start:
critic loss [0.023591373, 0.021617575]
critic loss [0.023294352, 0.021426708]
critic loss [0.02267234, 0.02108358]
critic loss [0.021832637, 0.020625312]
critic loss [0.020898167, 0.020093691]
actor loss 102.53
train end 0.5328457355499268
should be 1.0, predicted: [ 0.8815493], target predicted: [ 0.84218234]
should be 1.0, predicted: [ 1.08225226], target predicted: [ 0.97420192]
should be 1.0, predicted: [ 1.03855801], target predicted: [ 1.04034901]
start searching new action
get new action:  0.23559927940368652
start getting new_q
get new_q:  0.0664207935333252
train start:
critic loss [0.032927778, 0.021755749]
critic loss [0.030367667, 0.02112978]
critic loss [0.026982877, 0.020346066]
critic loss [0.023490746, 0.019503739]
critic loss [0.02048035, 0.0186731]
actor loss 50.1614
train end 0.517547607421875
should be 1.0, predicted: [ 0.8021611], target predicted: [ 0.81084359]
sh

actor loss 57.0983
train end 0.9033291339874268
should be -1.0, predicted: [-0.98573995], target predicted: [-0.93740273]
should be 1.0, predicted: [ 1.02633417], target predicted: [ 1.0908761]
should be 1.0, predicted: [ 0.78329116], target predicted: [ 0.82426924]
should be 1.0, predicted: [ 0.76170939], target predicted: [ 0.61958778]
should be 1.0, predicted: [ 0.94106793], target predicted: [ 0.83445418]
should be 1.0, predicted: [ 0.92904723], target predicted: [ 0.89684922]
should be -1.0, predicted: [-0.46918836], target predicted: [-0.58577365]
start searching new action
get new action:  0.24666333198547363
start getting new_q
get new_q:  0.06967282295227051
train start:
critic loss [0.035262354, 0.025874063]
critic loss [0.035358578, 0.026337029]
critic loss [0.034656554, 0.026687708]
critic loss [0.033251613, 0.026787741]
critic loss [0.031390171, 0.026614126]
actor loss 79.4937
train end 0.5769634246826172
should be 1.0, predicted: [ 1.09981406], target predicted: [ 0.77497

get new action:  0.23457980155944824
start getting new_q
get new_q:  0.06141376495361328
train start:
critic loss [0.032012202, 0.017940396]
critic loss [0.030810552, 0.017429689]
critic loss [0.028275846, 0.016543819]
critic loss [0.024952315, 0.015466617]
critic loss [0.021433378, 0.01439698]
actor loss 67.2278
train end 0.5282447338104248
should be 1.0, predicted: [ 1.05244946], target predicted: [ 0.87346441]
should be 1.0, predicted: [ 0.77434593], target predicted: [ 0.63833743]
should be -1.0, predicted: [-0.82568085], target predicted: [-0.82226622]
should be 1.0, predicted: [ 1.00905979], target predicted: [ 0.74924785]
start searching new action
get new action:  0.24212026596069336
start getting new_q
get new_q:  0.06931400299072266
train start:
critic loss [0.041163109, 0.028647939]
critic loss [0.038999215, 0.028283961]
critic loss [0.036480732, 0.027983988]
critic loss [0.034017906, 0.027812921]
critic loss [0.031916898, 0.027797151]
actor loss 15.2387
train end 0.58238387

critic loss [0.026362734, 0.021452541]
critic loss [0.024662927, 0.020908352]
critic loss [0.023013975, 0.020401988]
critic loss [0.021581011, 0.019956198]
actor loss 65.9719
train end 0.5071337223052979
should be -1.0, predicted: [-0.90469331], target predicted: [-0.76363641]
should be 1.0, predicted: [ 1.0400362], target predicted: [ 0.85617644]
should be -1.0, predicted: [-0.95343864], target predicted: [-0.6243915]
start searching new action
get new action:  0.22482872009277344
start getting new_q
get new_q:  0.06305432319641113
train start:
critic loss [0.035189062, 0.023131866]
critic loss [0.035203636, 0.023414046]
critic loss [0.034019016, 0.023234701]
critic loss [0.0319148, 0.022664651]
critic loss [0.029249828, 0.021821495]
actor loss 32.7981
train end 0.5098617076873779
should be 1.0, predicted: [ 0.62684053], target predicted: [ 0.87491757]
should be 1.0, predicted: [ 1.05245948], target predicted: [ 0.96811676]
should be 1.0, predicted: [ 0.99861532], target predicted: [ 

get new action:  0.2449479103088379
start getting new_q
get new_q:  0.0630197525024414
train start:
critic loss [0.026894879, 0.020436447]
critic loss [0.026089089, 0.020105761]
critic loss [0.02460606, 0.01962759]
critic loss [0.022748843, 0.019056991]
critic loss [0.020838283, 0.018453373]
actor loss 61.024
train end 0.5684230327606201
should be 1.0, predicted: [ 0.87838638], target predicted: [ 0.77053529]
start searching new action
get new action:  0.23061418533325195
start getting new_q
get new_q:  0.07099270820617676
train start:
critic loss [0.017378468, 0.016871333]
critic loss [0.017271563, 0.016898196]
critic loss [0.017113443, 0.01687674]
critic loss [0.01691591, 0.016793452]
critic loss [0.016690068, 0.016645011]
actor loss 38.2266
train end 0.5365982055664062
should be 1.0, predicted: [ 0.93274844], target predicted: [ 0.86799955]
start searching new action
get new action:  0.23881793022155762
start getting new_q
get new_q:  0.06318449974060059
train start:
critic loss [0.

critic loss [0.027633684, 0.026759556]
actor loss 0.409233
train end 0.5283334255218506
should be 1.0, predicted: [ 0.95090443], target predicted: [ 1.01257944]
should be 1.0, predicted: [ 0.93049914], target predicted: [ 0.96125305]
start searching new action
get new action:  0.2237529754638672
start getting new_q
get new_q:  0.06086611747741699
train start:
critic loss [0.05273905, 0.026322767]
critic loss [0.050317239, 0.025360722]
critic loss [0.046924736, 0.024320003]
critic loss [0.042872407, 0.023277506]
critic loss [0.038571142, 0.022317372]
actor loss 53.3694
train end 0.5293757915496826
should be 1.0, predicted: [ 0.87616783], target predicted: [ 0.66693199]
should be 1.0, predicted: [ 1.24117124], target predicted: [ 1.15215003]
should be -1.0, predicted: [-0.88331968], target predicted: [-0.73115724]
should be 1.0, predicted: [ 0.71405679], target predicted: [ 0.62211102]
should be -1.0, predicted: [-0.49352613], target predicted: [-0.44670781]
start searching new action
ge

Episode : 457 Replay Buffer 48059
TOTAL REWARD @ 457-th Episode  : Reward 1
Total Step: 127
start searching new action
get new action:  0.24935245513916016
start getting new_q
get new_q:  0.06847286224365234
train start:
critic loss [0.024324145, 0.024324145]
critic loss [0.024231117, 0.024231117]
critic loss [0.023924563, 0.023924563]
critic loss [0.023430819, 0.023430819]
critic loss [0.022786755, 0.022786755]
actor loss 47.3888
train end 0.6221489906311035
start searching new action
get new action:  0.25470566749572754
start getting new_q
get new_q:  0.06994986534118652
train start:
critic loss [0.022488553, 0.020331163]
critic loss [0.021388499, 0.01957082]
critic loss [0.019997764, 0.018694989]
critic loss [0.01855946, 0.01780282]
critic loss [0.017273348, 0.016963473]
actor loss 52.9935
train end 0.6759700775146484
should be 1.0, predicted: [ 1.12910068], target predicted: [ 0.96506447]
start searching new action
get new action:  0.24658942222595215
start getting new_q
get new_q:

critic loss [0.032793224, 0.022051057]
critic loss [0.031838413, 0.021832837]
critic loss [0.030394811, 0.021324385]
critic loss [0.028603476, 0.02058783]
actor loss 44.4003
train end 0.5144286155700684
should be 1.0, predicted: [ 1.1712178], target predicted: [ 1.02061653]
should be 1.0, predicted: [ 0.60776335], target predicted: [ 0.51572847]
Episode : 459 Replay Buffer 48200
TOTAL REWARD @ 459-th Episode  : Reward 1
Total Step: 8
start searching new action
get new action:  0.2391200065612793
start getting new_q
get new_q:  0.06678247451782227
train start:
critic loss [0.023027111, 0.023005715]
critic loss [0.022438556, 0.022407543]
critic loss [0.021743368, 0.021700528]
critic loss [0.020999422, 0.02094331]
critic loss [0.020258231, 0.020188764]
actor loss 40.798
train end 0.5128161907196045
should be 1.0, predicted: [ 1.11415827], target predicted: [ 0.83293808]
start searching new action
get new action:  0.23944306373596191
start getting new_q
get new_q:  0.06632232666015625
trai

critic loss [0.020942273, 0.020502407]
critic loss [0.020396531, 0.020031054]
critic loss [0.019507201, 0.01926172]
critic loss [0.018464433, 0.018336669]
actor loss 65.8382
train end 0.6239922046661377
should be 1.0, predicted: [ 1.04552877], target predicted: [ 1.02173007]
should be 1.0, predicted: [ 1.06690538], target predicted: [ 0.91925859]
start searching new action
get new action:  0.23950862884521484
start getting new_q
get new_q:  0.06652617454528809
train start:
critic loss [0.03302468, 0.020049699]
critic loss [0.03123193, 0.019727711]
critic loss [0.028733464, 0.019301742]
critic loss [0.02599187, 0.018841177]
critic loss [0.023423167, 0.018403348]
actor loss 28.9654
train end 0.5855484008789062
should be 1.0, predicted: [ 0.62814391], target predicted: [ 0.48834735]
should be 1.0, predicted: [ 1.22104001], target predicted: [ 1.10552526]
should be -1.0, predicted: [-1.0836575], target predicted: [-1.13154411]
should be -1.0, predicted: [-0.86451906], target predicted: [-0

get new action:  0.24799680709838867
start getting new_q
get new_q:  0.07105016708374023
train start:
critic loss [0.031456944, 0.01695868]
critic loss [0.029374652, 0.016449306]
critic loss [0.026563128, 0.015861733]
critic loss [0.023788739, 0.015300651]
critic loss [0.021036558, 0.014772671]
actor loss 25.2568
train end 0.6087043285369873
should be -1.0, predicted: [-0.99059194], target predicted: [-1.01711237]
should be 1.0, predicted: [ 0.60149318], target predicted: [ 0.37571779]
start searching new action
get new action:  0.24769306182861328
start getting new_q
get new_q:  0.07567667961120605
train start:
critic loss [0.018144598, 0.01813497]
critic loss [0.017710101, 0.017656211]
critic loss [0.017234784, 0.017116601]
critic loss [0.016739711, 0.016558461]
critic loss [0.016233256, 0.016006237]
actor loss 72.837
train end 0.6664423942565918
should be -1.0, predicted: [-1.05171287], target predicted: [-1.14684331]
start searching new action
get new action:  0.2312147617340088
st

should be 1.0, predicted: [ 1.41891479], target predicted: [ 1.04498088]
start searching new action
get new action:  0.24139833450317383
start getting new_q
get new_q:  0.06721639633178711
train start:
critic loss [0.041401897, 0.0355848]
critic loss [0.041430451, 0.035635084]
critic loss [0.040818121, 0.035278387]
critic loss [0.039715543, 0.034629334]
critic loss [0.038230721, 0.033740066]
actor loss 53.5641
train end 0.6063346862792969
should be -1.0, predicted: [-0.45712057], target predicted: [-0.85817689]
should be 1.0, predicted: [ 0.85313332], target predicted: [ 0.4106099]
start searching new action
get new action:  0.25606322288513184
start getting new_q
get new_q:  0.07948851585388184
train start:
critic loss [0.021956008, 0.021956008]
critic loss [0.021710385, 0.021710385]
critic loss [0.021290515, 0.021290515]
critic loss [0.020721968, 0.020721968]
critic loss [0.020041142, 0.020041142]
actor loss 49.9287
train end 0.9703149795532227
start searching new action
get new acti

critic loss [0.030037709, 0.020117205]
critic loss [0.029730424, 0.020189537]
critic loss [0.027995102, 0.019946355]
critic loss [0.026161551, 0.019493181]
actor loss 17.2582
train end 0.6553206443786621
should be -1.0, predicted: [-0.4801819], target predicted: [-0.49608088]
start searching new action
get new action:  0.23710322380065918
start getting new_q
get new_q:  0.07052826881408691
train start:
critic loss [0.03184735, 0.025189932]
critic loss [0.029844858, 0.024181472]
critic loss [0.026665803, 0.022678461]
critic loss [0.023331579, 0.021066751]
critic loss [0.020684525, 0.01966029]
actor loss 2.63965
train end 0.5651564598083496
should be 1.0, predicted: [ 1.0157758], target predicted: [ 1.0202049]
should be 1.0, predicted: [ 0.99056536], target predicted: [ 0.93062884]
should be 1.0, predicted: [ 1.01160204], target predicted: [ 0.92596823]
should be 1.0, predicted: [ 0.5756889], target predicted: [ 0.87757808]
should be 1.0, predicted: [ 0.962322], target predicted: [ 0.920

critic loss [0.036196537, 0.032228038]
actor loss 20.0416
train end 0.5712141990661621
should be -1.0, predicted: [-0.93046373], target predicted: [-0.93217683]
should be 1.0, predicted: [ 0.46489361], target predicted: [ 0.71626365]
should be 1.0, predicted: [ 1.00592041], target predicted: [ 1.3584398]
Episode : 468 Replay Buffer 49389
TOTAL REWARD @ 468-th Episode  : Reward -1
Total Step: 131
start searching new action
get new action:  0.25444746017456055
start getting new_q
get new_q:  0.07218623161315918
train start:
critic loss [0.036220383, 0.021142755]
critic loss [0.03490413, 0.020893546]
critic loss [0.032819226, 0.020597309]
critic loss [0.030374387, 0.020297691]
critic loss [0.027830578, 0.020009091]
actor loss 25.7017
train end 0.5664324760437012
should be 1.0, predicted: [ 0.4696627], target predicted: [ 0.35436526]
start searching new action
get new action:  0.2562119960784912
start getting new_q
get new_q:  0.07298851013183594
train start:
critic loss [0.036452044, 0.02

critic loss [0.023564883, 0.023470715]
actor loss 52.557
train end 0.5240888595581055
should be 1.0, predicted: [ 0.94563138], target predicted: [ 0.91530436]
trained action prob map predicted by initial model for a starting game
[[[ 0.34225729  0.00837149  0.0083886   0.00838426  0.00835837  0.00838027
    0.00838408]
  [ 0.00838556  0.00838347  0.00836383  0.00836448  0.00838466  0.00838008
    0.0083771 ]
  [ 0.00838039  0.00838449  0.00837394  0.00838842  0.00838056  0.00838865
    0.0083874 ]
  [ 0.00837616  0.00836446  0.00837617  0.00838455  0.00836177  0.00837925
    0.00838882]
  [ 0.00839291  0.0083737   0.00835809  0.00839005  0.00837613  0.00836973
    0.00836194]
  [ 0.00837984  0.00838028  0.00837866  0.00837277  0.00838974  0.00838527
    0.00838199]
  [ 0.00838632  0.0083884   0.00836775  0.00838403  0.00838193  0.00837994
    0.26394194]]

 [[ 0.00358066  0.03446642  0.05707518  0.00357939  0.0035794   0.00358686
    0.00358056]
  [ 0.04369732  0.0334226   0.08078754  

get new action:  0.23488593101501465
start getting new_q
get new_q:  0.08283257484436035
train start:
critic loss [0.036916964, 0.024884969]
critic loss [0.035360493, 0.024786429]
critic loss [0.032569967, 0.024304196]
critic loss [0.029243909, 0.023588147]
critic loss [0.026034707, 0.022779645]
actor loss 110.886
train end 0.53023362159729
should be 1.0, predicted: [ 1.01215672], target predicted: [ 1.01488721]
should be 1.0, predicted: [ 1.03339911], target predicted: [ 0.91797918]
should be 1.0, predicted: [ 0.95183903], target predicted: [ 0.71465844]
should be 1.0, predicted: [ 0.98731703], target predicted: [ 0.7447744]
start searching new action
get new action:  0.2422475814819336
start getting new_q
get new_q:  0.06710076332092285
train start:
critic loss [0.028443122, 0.025478158]
critic loss [0.028255783, 0.025671571]
critic loss [0.027970463, 0.025787946]
critic loss [0.027615106, 0.025826283]
critic loss [0.027199406, 0.02577183]
actor loss 28.5325
train end 0.6620194911956

critic loss [0.052004568, 0.02427692]
critic loss [0.04447056, 0.022518529]
critic loss [0.036651406, 0.020959677]
critic loss [0.029920451, 0.01993517]
actor loss 20.7495
train end 0.4945700168609619
should be -1.0, predicted: [-0.94585299], target predicted: [-0.7012552]
should be 1.0, predicted: [ 0.83327049], target predicted: [ 0.47505748]
should be -1.0, predicted: [-0.97726578], target predicted: [-0.75188649]
start searching new action
get new action:  0.22650551795959473
start getting new_q
get new_q:  0.07228398323059082
train start:
critic loss [0.054030143, 0.023044441]
critic loss [0.050508268, 0.023057364]
critic loss [0.045602992, 0.023098394]
critic loss [0.040287629, 0.023107424]
critic loss [0.035199799, 0.023018967]
actor loss 3.06684
train end 0.5051040649414062
should be 1.0, predicted: [ 0.71736372], target predicted: [ 0.74318779]
should be -1.0, predicted: [-1.28669], target predicted: [-0.56473827]
should be 1.0, predicted: [ 0.73807812], target predicted: [ 0.

actor loss 83.5239
train end 0.7742834091186523
should be 1.0, predicted: [ 0.98552406], target predicted: [ 0.95372379]
should be 1.0, predicted: [ 1.07824731], target predicted: [ 1.26563048]
start searching new action
get new action:  0.24474287033081055
start getting new_q
get new_q:  0.07156133651733398
train start:
critic loss [0.030172676, 0.019113343]
critic loss [0.029290367, 0.018589176]
critic loss [0.027412195, 0.018028177]
critic loss [0.024978906, 0.017515369]
critic loss [0.022429574, 0.017097037]
actor loss 50.9491
train end 0.6638433933258057
should be 1.0, predicted: [ 0.99962157], target predicted: [ 0.96688509]
should be 1.0, predicted: [ 0.60603094], target predicted: [ 0.63126481]
should be 1.0, predicted: [ 1.10035717], target predicted: [ 0.97830373]
start searching new action
get new action:  0.25153422355651855
start getting new_q
get new_q:  0.06619739532470703
train start:
critic loss [0.045105606, 0.032293402]
critic loss [0.044398576, 0.032637641]
critic l

critic loss [0.044130236, 0.024643853]
critic loss [0.039242163, 0.02432321]
critic loss [0.034177385, 0.024208216]
critic loss [0.029940449, 0.024325214]
actor loss 51.0126
train end 0.5294854640960693
should be 1.0, predicted: [ 0.7902509], target predicted: [ 0.58789396]
should be 1.0, predicted: [ 0.83519137], target predicted: [ 0.71703929]
should be 1.0, predicted: [ 1.01726592], target predicted: [ 0.92970872]
start searching new action
get new action:  0.2594265937805176
start getting new_q
get new_q:  0.06867551803588867
train start:
critic loss [0.036406733, 0.035465214]
critic loss [0.035967346, 0.03523314]
critic loss [0.034981683, 0.034427304]
critic loss [0.033480242, 0.033071976]
critic loss [0.031546216, 0.031249974]
actor loss 35.7504
train end 0.512291669845581
should be 1.0, predicted: [ 1.00111985], target predicted: [ 0.7736311]
start searching new action
get new action:  0.23161959648132324
start getting new_q
get new_q:  0.07207965850830078
train start:
critic lo

get new action:  0.23641180992126465
start getting new_q
get new_q:  0.06563496589660645
train start:
critic loss [0.041503631, 0.026510805]
critic loss [0.039595157, 0.026318286]
critic loss [0.037340768, 0.026112037]
critic loss [0.034910064, 0.025841875]
critic loss [0.032372676, 0.025410738]
actor loss 19.0874
train end 0.5563559532165527
should be 1.0, predicted: [ 0.85809648], target predicted: [ 0.98777354]
should be -1.0, predicted: [-0.69181967], target predicted: [-0.7337724]
should be 1.0, predicted: [ 0.60860276], target predicted: [ 0.62339294]
should be 1.0, predicted: [ 0.69017935], target predicted: [ 0.66029501]
should be 1.0, predicted: [ 0.98360109], target predicted: [ 0.6842736]
Episode : 479 Replay Buffer 50000
TOTAL REWARD @ 479-th Episode  : Reward -1
Total Step: 98
start searching new action
get new action:  0.24586224555969238
start getting new_q
get new_q:  0.07261157035827637
train start:
critic loss [0.041877158, 0.032494076]
critic loss [0.039559413, 0.032

actor loss 30.94
train end 0.6603941917419434
should be 1.0, predicted: [ 0.93854111], target predicted: [ 0.82887495]
should be -1.0, predicted: [-0.7017501], target predicted: [-0.86557782]
should be 1.0, predicted: [ 1.02470994], target predicted: [ 1.11657536]
start searching new action
get new action:  0.2517271041870117
start getting new_q
get new_q:  0.07472968101501465
train start:
critic loss [0.054734297, 0.03559389]
critic loss [0.049237199, 0.034153897]
critic loss [0.043741331, 0.032900814]
critic loss [0.039444748, 0.03212123]
critic loss [0.036797419, 0.031856693]
actor loss 4.8864
train end 0.7732734680175781
should be -1.0, predicted: [-1.22828126], target predicted: [-0.8097434]
should be 1.0, predicted: [ 0.98154479], target predicted: [ 1.00597036]
should be 1.0, predicted: [ 0.55306393], target predicted: [ 0.43281224]
should be -1.0, predicted: [-1.14667165], target predicted: [-1.11586273]
start searching new action
get new action:  0.24782967567443848
start gett

actor loss 32.6859
train end 0.72635817527771
start searching new action
get new action:  0.24116730690002441
start getting new_q
get new_q:  0.06866049766540527
train start:
critic loss [0.029036587, 0.025165191]
critic loss [0.02818385, 0.024589704]
critic loss [0.027013162, 0.023963103]
critic loss [0.025658682, 0.023298543]
critic loss [0.024271131, 0.022625884]
actor loss 8.12791
train end 0.559283971786499
should be 1.0, predicted: [ 0.86257905], target predicted: [ 0.71833265]
start searching new action
get new action:  0.2284705638885498
start getting new_q
get new_q:  0.06432104110717773
train start:
critic loss [0.040243212, 0.031169809]
critic loss [0.039884254, 0.030745875]
critic loss [0.03820385, 0.029626876]
critic loss [0.035566639, 0.028026422]
critic loss [0.032453455, 0.026213596]
actor loss 48.2241
train end 0.6266851425170898
should be 1.0, predicted: [ 0.77833474], target predicted: [ 0.68356764]
should be 1.0, predicted: [ 0.9492408], target predicted: [ 0.977128

critic loss [0.040378287, 0.02831335]
critic loss [0.039034754, 0.027913151]
critic loss [0.037475053, 0.02742986]
critic loss [0.035734463, 0.026867555]
actor loss 1.18354
train end 0.6212611198425293
should be -1.0, predicted: [-0.81472218], target predicted: [-0.52240199]
should be -1.0, predicted: [-1.26121879], target predicted: [-1.2435174]
should be 1.0, predicted: [ 0.71845704], target predicted: [ 0.80171269]
should be -1.0, predicted: [-1.09465575], target predicted: [-1.07528925]
should be 1.0, predicted: [ 0.36133218], target predicted: [ 0.58958429]
start searching new action
get new action:  0.2760941982269287
start getting new_q
get new_q:  0.07801294326782227
train start:
critic loss [0.043256339, 0.024991378]
critic loss [0.042118527, 0.024627915]
critic loss [0.039775789, 0.024127349]
critic loss [0.036683381, 0.023571655]
critic loss [0.033360898, 0.023055606]
actor loss 12.2251
train end 0.6751844882965088
should be 1.0, predicted: [ 0.44311655], target predicted: [

start searching new action
get new action:  0.26499295234680176
start getting new_q
get new_q:  0.06739497184753418
train start:
critic loss [0.040463738, 0.026914377]
critic loss [0.038999848, 0.026783129]
critic loss [0.036715422, 0.026469681]
critic loss [0.03399862, 0.026022544]
critic loss [0.031260572, 0.025496006]
actor loss 41.1519
train end 0.668257474899292
should be 1.0, predicted: [ 0.80716866], target predicted: [ 0.64691108]
should be -1.0, predicted: [-0.64577943], target predicted: [-0.38639572]
Episode : 486 Replay Buffer 50000
TOTAL REWARD @ 486-th Episode  : Reward 1
Total Step: 20
start searching new action
get new action:  0.24457907676696777
start getting new_q
get new_q:  0.06549882888793945
train start:
critic loss [0.023070822, 0.020431882]
critic loss [0.022719968, 0.02016132]
critic loss [0.021942709, 0.019660546]
critic loss [0.020816451, 0.018949825]
critic loss [0.019470423, 0.018087596]
actor loss 46.1197
train end 0.6497442722320557
should be 1.0, predic

critic loss [0.019765131, 0.016818671]
actor loss 58.4091
train end 0.5264987945556641
should be -1.0, predicted: [-1.12920713], target predicted: [-0.88730305]
should be 1.0, predicted: [ 0.80698013], target predicted: [ 0.98530251]
should be -1.0, predicted: [-0.98479509], target predicted: [-0.92111522]
should be 1.0, predicted: [ 1.03113925], target predicted: [ 0.9583239]
start searching new action
get new action:  0.24855995178222656
start getting new_q
get new_q:  0.08699154853820801
train start:
critic loss [0.023290548, 0.018975561]
critic loss [0.022361491, 0.018849272]
critic loss [0.021176057, 0.018680461]
critic loss [0.019973375, 0.018475004]
critic loss [0.018948125, 0.018233187]
actor loss 13.2609
train end 0.7039289474487305
should be 1.0, predicted: [ 0.79550505], target predicted: [ 0.92543852]
should be -1.0, predicted: [-1.20131683], target predicted: [-0.65367621]
should be 1.0, predicted: [ 0.68609655], target predicted: [ 0.96884304]
start searching new action
g

actor loss 44.8642
train end 0.6597800254821777
should be -1.0, predicted: [-0.93609017], target predicted: [-0.76081109]
should be -1.0, predicted: [-0.43677586], target predicted: [-0.33270693]
start searching new action
get new action:  0.23774218559265137
start getting new_q
get new_q:  0.06785249710083008
train start:
critic loss [0.027455917, 0.019064508]
critic loss [0.027802192, 0.019142013]
critic loss [0.027082795, 0.019034451]
critic loss [0.025433306, 0.018723357]
critic loss [0.02321885, 0.018241098]
actor loss 38.5489
train end 0.8118503093719482
should be 1.0, predicted: [ 0.9900316], target predicted: [ 0.8817144]
should be 1.0, predicted: [ 0.78792554], target predicted: [ 0.97859651]
should be 1.0, predicted: [ 0.37208331], target predicted: [ 0.76994938]
start searching new action
get new action:  0.2367873191833496
start getting new_q
get new_q:  0.06622695922851562
train start:
critic loss [0.031936042, 0.024157759]
critic loss [0.029291954, 0.022897901]
critic los

get new action:  0.23181676864624023
start getting new_q
get new_q:  0.06602621078491211
train start:
critic loss [0.032356039, 0.024335356]
critic loss [0.03163211, 0.024387788]
critic loss [0.030302225, 0.024248436]
critic loss [0.02859861, 0.023948511]
critic loss [0.026753953, 0.023530986]
actor loss 60.7754
train end 0.5786483287811279
should be -1.0, predicted: [-0.00321247], target predicted: [-0.59778142]
Episode : 491 Replay Buffer 50000
TOTAL REWARD @ 491-th Episode  : Reward 1
Total Step: 130
start searching new action
get new action:  0.3121175765991211
start getting new_q
get new_q:  0.07467913627624512
train start:
critic loss [0.024095725, 0.024095725]
critic loss [0.023856398, 0.023856398]
critic loss [0.023456804, 0.023456804]
critic loss [0.022917688, 0.022917688]
critic loss [0.022271208, 0.022271208]
actor loss 59.5486
train end 0.6495234966278076
start searching new action
get new action:  0.2393782138824463
start getting new_q
get new_q:  0.0678403377532959
train 

critic loss [0.019247489, 0.017099168]
actor loss 68.9141
train end 0.5846295356750488
should be 1.0, predicted: [ 1.27335656], target predicted: [ 0.91186386]
should be 1.0, predicted: [ 0.97788471], target predicted: [ 0.94957113]
should be 1.0, predicted: [ 1.31470764], target predicted: [ 0.72940981]
Episode : 493 Replay Buffer 50000
TOTAL REWARD @ 493-th Episode  : Reward -1
Total Step: 122
start searching new action
get new action:  0.26908040046691895
start getting new_q
get new_q:  0.07459259033203125
train start:
critic loss [0.032963827, 0.028883072]
critic loss [0.032098878, 0.028388076]
critic loss [0.030742548, 0.027662]
critic loss [0.029225953, 0.026864696]
critic loss [0.027809363, 0.026121469]
actor loss 30.7012
train end 0.7240548133850098
should be 1.0, predicted: [ 1.29042685], target predicted: [ 0.94039428]
should be 1.0, predicted: [ 1.33624744], target predicted: [ 0.86267781]
should be 1.0, predicted: [ 0.96070266], target predicted: [ 0.9585712]
should be 1.0,

critic loss [0.020531435, 0.020531435]
critic loss [0.019858558, 0.019858558]
critic loss [0.01905008, 0.01905008]
actor loss 41.6365
train end 0.7037882804870605
start searching new action
get new action:  0.24198293685913086
start getting new_q
get new_q:  0.06491827964782715
train start:
critic loss [0.035501964, 0.021548437]
critic loss [0.033580311, 0.020906255]
critic loss [0.03018415, 0.019766133]
critic loss [0.026242416, 0.018483076]
critic loss [0.021669649, 0.017137634]
actor loss 56.5946
train end 0.5660207271575928
should be 1.0, predicted: [ 0.68999386], target predicted: [ 0.75070894]
should be 1.0, predicted: [ 0.62778771], target predicted: [ 0.6409173]
should be 1.0, predicted: [ 0.72038054], target predicted: [ 0.74031901]
Episode : 495 Replay Buffer 50000
TOTAL REWARD @ 495-th Episode  : Reward -1
Total Step: 129
start searching new action
get new action:  0.26740145683288574
start getting new_q
get new_q:  0.07209420204162598
train start:
critic loss [0.026352299, 

critic loss [0.021828508, 0.019950701]
critic loss [0.021726156, 0.01987483]
critic loss [0.021215692, 0.019522702]
critic loss [0.020355958, 0.018934321]
actor loss 19.4518
train end 0.5553765296936035
should be -1.0, predicted: [-0.79926807], target predicted: [-0.84996516]
should be 1.0, predicted: [ 1.20859802], target predicted: [ 0.93904006]
start searching new action
get new action:  0.29192018508911133
start getting new_q
get new_q:  0.0821847915649414
train start:
critic loss [0.026239745, 0.026239745]
critic loss [0.025440061, 0.025440061]
critic loss [0.02409837, 0.02409837]
critic loss [0.022436183, 0.022436183]
critic loss [0.020685514, 0.020685514]
actor loss -24.888
train end 0.6355187892913818
Episode : 497 Replay Buffer 50000
TOTAL REWARD @ 497-th Episode  : Reward -1
Total Step: 127
start searching new action
get new action:  0.26151275634765625
start getting new_q
get new_q:  0.0734410285949707
train start:
critic loss [0.03979893, 0.029174242]
critic loss [0.0380837

start searching new action
get new action:  0.24980998039245605
start getting new_q
get new_q:  0.07287025451660156
train start:
critic loss [0.02559837, 0.018127268]
critic loss [0.025181813, 0.01862736]
critic loss [0.023901619, 0.018833939]
critic loss [0.022200458, 0.0187933]
critic loss [0.020474207, 0.01855555]
actor loss 17.819
train end 0.5785439014434814
should be 1.0, predicted: [ 0.4815647], target predicted: [ 0.53330886]
should be -1.0, predicted: [-1.0959729], target predicted: [-0.95129895]
start searching new action
get new action:  0.24743103981018066
start getting new_q
get new_q:  0.06646609306335449
train start:
critic loss [0.047646131, 0.024478089]
critic loss [0.046914145, 0.024271894]
critic loss [0.043265846, 0.023270216]
critic loss [0.037719462, 0.021751065]
critic loss [0.031522367, 0.02006137]
actor loss 32.1927
train end 0.5245850086212158
should be 1.0, predicted: [ 1.00075507], target predicted: [ 0.96912718]
should be -1.0, predicted: [-0.958269], targe

critic loss [0.036450196, 0.027678978]
critic loss [0.033539195, 0.026637642]
critic loss [0.030069165, 0.025257945]
critic loss [0.026677579, 0.02377924]
actor loss 38.6277
train end 0.815502405166626
should be 1.0, predicted: [ 0.97041255], target predicted: [ 0.92123342]
should be 1.0, predicted: [ 1.15583241], target predicted: [ 0.90932393]
start searching new action
get new action:  0.28186464309692383
start getting new_q
get new_q:  0.09978914260864258
train start:
critic loss [0.03398956, 0.024828909]
critic loss [0.03112613, 0.023000199]
critic loss [0.028320035, 0.021360952]
critic loss [0.02599059, 0.020039998]
critic loss [0.024313472, 0.019081019]
actor loss 58.9429
train end 1.0086088180541992
should be 1.0, predicted: [ 1.17485988], target predicted: [ 1.22721243]
should be 1.0, predicted: [ 1.01861107], target predicted: [ 0.66158104]
should be 1.0, predicted: [ 1.00482118], target predicted: [ 0.83547884]
should be 1.0, predicted: [ 0.75373852], target predicted: [ 0.5

Episode : 502 Replay Buffer 50000
TOTAL REWARD @ 502-th Episode  : Reward -1
Total Step: 116
start searching new action
get new action:  0.25821924209594727
start getting new_q
get new_q:  0.06768321990966797
train start:
critic loss [0.028820412, 0.027776955]
critic loss [0.030196846, 0.028705006]
critic loss [0.031103171, 0.029212669]
critic loss [0.031276636, 0.029195178]
critic loss [0.030636996, 0.028641663]
actor loss 16.5554
train end 0.5205731391906738
should be 1.0, predicted: [ 0.94378215], target predicted: [ 0.92647916]
should be 1.0, predicted: [ 0.77593464], target predicted: [ 0.68110031]
should be 1.0, predicted: [ 0.85975403], target predicted: [ 0.88244057]
should be 1.0, predicted: [ 0.87798095], target predicted: [ 0.99702543]
start searching new action
get new action:  0.24596905708312988
start getting new_q
get new_q:  0.07694721221923828
train start:
critic loss [0.026943617, 0.026200883]
critic loss [0.026206724, 0.025478845]
critic loss [0.025203414, 0.02456867

get new action:  0.2497255802154541
start getting new_q
get new_q:  0.06676387786865234
train start:
critic loss [0.034732737, 0.021130316]
critic loss [0.02596809, 0.019947033]
critic loss [0.020245841, 0.019056046]
critic loss [0.019642642, 0.018660918]
critic loss [0.019093281, 0.018287597]
actor loss 25.2701
train end 0.5647919178009033
should be 1.0, predicted: [ 0.84599131], target predicted: [ 1.00191104]
should be 1.0, predicted: [ 0.95879948], target predicted: [ 0.88664007]
should be 1.0, predicted: [ 0.87815648], target predicted: [ 0.93606615]
Episode : 504 Replay Buffer 50000
TOTAL REWARD @ 504-th Episode  : Reward -1
Total Step: 136
start searching new action
get new action:  0.27620673179626465
start getting new_q
get new_q:  0.08624887466430664
train start:
critic loss [0.016448632, 0.01451418]
critic loss [0.016649593, 0.014517821]
critic loss [0.016544685, 0.014385927]
critic loss [0.01613814, 0.014105251]
critic loss [0.015572871, 0.013785001]
actor loss 20.552
train

get new action:  0.2335202693939209
start getting new_q
get new_q:  0.0628502368927002
train start:
critic loss [0.030422868, 0.022869099]
critic loss [0.029632758, 0.022476155]
critic loss [0.028285202, 0.022010216]
critic loss [0.026624307, 0.021526657]
critic loss [0.024928454, 0.021074209]
actor loss 45.1655
train end 0.5275065898895264
should be 1.0, predicted: [ 0.63867915], target predicted: [ 0.60546166]
start searching new action
get new action:  0.2322859764099121
start getting new_q
get new_q:  0.06322884559631348
train start:
critic loss [0.090614595, 0.026358115]
critic loss [0.081182994, 0.025280781]
critic loss [0.067297354, 0.023693711]
critic loss [0.052761517, 0.022111086]
critic loss [0.040320285, 0.020906378]
actor loss 47.7391
train end 0.517812967300415
should be -1.0, predicted: [-0.90352607], target predicted: [-1.13693964]
should be 1.0, predicted: [ 0.99857551], target predicted: [ 0.9207381]
should be 1.0, predicted: [ 0.74116385], target predicted: [ 0.62535

critic loss [0.02169453, 0.019039236]
actor loss 3.35433
train end 0.592111349105835
should be -1.0, predicted: [-0.90167773], target predicted: [-0.89368296]
should be 1.0, predicted: [ 0.77248466], target predicted: [ 1.09741867]
should be -1.0, predicted: [-1.14518774], target predicted: [-1.28869772]
should be 1.0, predicted: [ 1.08702564], target predicted: [ 1.01802671]
start searching new action
get new action:  0.23868441581726074
start getting new_q
get new_q:  0.06898999214172363
train start:
critic loss [0.063342243, 0.026589179]
critic loss [0.060903739, 0.026039233]
critic loss [0.056880228, 0.025142696]
critic loss [0.051800665, 0.024016727]
critic loss [0.046074618, 0.022770379]
actor loss 14.2611
train end 0.5451233386993408
should be 1.0, predicted: [ 0.36051071], target predicted: [ 0.40885437]
should be -1.0, predicted: [-0.56960434], target predicted: [-0.56570268]
should be 1.0, predicted: [ 0.76682502], target predicted: [ 0.85684139]
start searching new action
ge

get new action:  0.2629678249359131
start getting new_q
get new_q:  0.07092881202697754
train start:
critic loss [0.025041349, 0.021732274]
critic loss [0.024496976, 0.02139245]
critic loss [0.023384685, 0.020886077]
critic loss [0.021920316, 0.020240666]
critic loss [0.020366317, 0.019490475]
actor loss 83.6037
train end 0.6195242404937744
should be 1.0, predicted: [ 0.97119313], target predicted: [ 0.47399044]
should be 1.0, predicted: [ 0.9494465], target predicted: [ 0.52859938]
start searching new action
get new action:  0.2506139278411865
start getting new_q
get new_q:  0.06827259063720703
train start:
critic loss [0.032478064, 0.019972533]
critic loss [0.030354256, 0.019215642]
critic loss [0.027519058, 0.018269928]
critic loss [0.024447812, 0.017270679]
critic loss [0.021520697, 0.016322058]
actor loss 55.7251
train end 0.5475199222564697
should be -1.0, predicted: [-0.99883014], target predicted: [-0.52438271]
should be 1.0, predicted: [ 0.51330262], target predicted: [ 0.4204

Episode : 511 Replay Buffer 50000
TOTAL REWARD @ 511-th Episode  : Reward 1
Total Step: 157
start searching new action
get new action:  0.2549562454223633
start getting new_q
get new_q:  0.07835841178894043
train start:
critic loss [0.03339887, 0.030009991]
critic loss [0.031072956, 0.027885737]
critic loss [0.028914534, 0.025901528]
critic loss [0.027109759, 0.024178606]
critic loss [0.025730649, 0.022764331]
actor loss 49.4309
train end 0.6776578426361084
should be 1.0, predicted: [ 0.83575761], target predicted: [ 0.87335575]
should be 1.0, predicted: [ 0.70120156], target predicted: [ 0.72208619]
should be 1.0, predicted: [ 0.5651716], target predicted: [ 0.5911355]
start searching new action
get new action:  0.24173474311828613
start getting new_q
get new_q:  0.06780600547790527
train start:
critic loss [0.049421739, 0.020093504]
critic loss [0.045338519, 0.019327691]
critic loss [0.039729126, 0.018433271]
critic loss [0.033751149, 0.017626636]
critic loss [0.028375946, 0.01705702

get new action:  0.23857402801513672
start getting new_q
get new_q:  0.06063580513000488
train start:
critic loss [0.030600838, 0.018569358]
critic loss [0.027901223, 0.017954245]
critic loss [0.024837168, 0.01722673]
critic loss [0.021817427, 0.016476639]
critic loss [0.01922364, 0.015809083]
actor loss 46.6564
train end 0.5256757736206055
should be 1.0, predicted: [ 1.05428827], target predicted: [ 0.77246636]
should be 1.0, predicted: [ 0.9917165], target predicted: [ 0.76385063]
should be 1.0, predicted: [ 0.79756409], target predicted: [ 0.57846099]
Episode : 513 Replay Buffer 50000
TOTAL REWARD @ 513-th Episode  : Reward 1
Total Step: 140
start searching new action
get new action:  0.25948238372802734
start getting new_q
get new_q:  0.07592511177062988
train start:
critic loss [0.020498089, 0.019917995]
critic loss [0.020672534, 0.020407367]
critic loss [0.020638419, 0.020473238]
critic loss [0.020312343, 0.020120475]
critic loss [0.019674677, 0.019415181]
actor loss 46.9484
trai

get new action:  0.2425861358642578
start getting new_q
get new_q:  0.0663754940032959
train start:
critic loss [0.050090007, 0.020068664]
critic loss [0.050896257, 0.020280313]
critic loss [0.048054911, 0.020017691]
critic loss [0.042571213, 0.019391984]
critic loss [0.035850469, 0.018595997]
actor loss 72.664
train end 0.6022045612335205
should be 1.0, predicted: [ 0.69827288], target predicted: [ 0.25978172]
should be 1.0, predicted: [ 1.20490241], target predicted: [ 0.9606474]
should be 1.0, predicted: [ 0.93703967], target predicted: [ 0.44415426]
should be 1.0, predicted: [ 1.28215957], target predicted: [ 0.91270775]
start searching new action
get new action:  0.2508108615875244
start getting new_q
get new_q:  0.06964707374572754
train start:
critic loss [0.13851538, 0.031664141]
critic loss [0.12890647, 0.03043494]
critic loss [0.10863814, 0.027865525]
critic loss [0.084438212, 0.024807945]
critic loss [0.06168887, 0.021990102]
actor loss 27.386
train end 0.5670220851898193
sh

actor loss 37.4475
train end 0.7626140117645264
should be 1.0, predicted: [ 0.91683424], target predicted: [ 0.96225303]
should be 1.0, predicted: [ 0.88621968], target predicted: [ 0.44080061]
should be -1.0, predicted: [ 0.05177631], target predicted: [-0.13121414]
should be 1.0, predicted: [ 1.29285848], target predicted: [ 0.72098905]
start searching new action
get new action:  0.24251651763916016
start getting new_q
get new_q:  0.06209135055541992
train start:
critic loss [0.038935024, 0.02037934]
critic loss [0.036375433, 0.019772597]
critic loss [0.032436132, 0.018873302]
critic loss [0.027825799, 0.017814957]
critic loss [0.023335814, 0.016768048]
actor loss 54.8168
train end 0.6517457962036133
should be 1.0, predicted: [ 0.93863338], target predicted: [ 0.73866439]
should be 1.0, predicted: [ 1.11218119], target predicted: [ 0.89811158]
start searching new action
get new action:  0.22760415077209473
start getting new_q
get new_q:  0.0670473575592041
train start:
critic loss [0

get new action:  0.2351670265197754
start getting new_q
get new_q:  0.07337117195129395
train start:
critic loss [0.067689136, 0.034567658]
critic loss [0.065996818, 0.034116343]
critic loss [0.061845385, 0.033097316]
critic loss [0.056163065, 0.031710826]
critic loss [0.050111979, 0.030213663]
actor loss 41.097
train end 0.7165489196777344
should be -1.0, predicted: [-1.06910408], target predicted: [-0.90317678]
should be -1.0, predicted: [-1.18360519], target predicted: [-1.13543093]
should be -1.0, predicted: [-0.44656509], target predicted: [-0.42459309]
should be 1.0, predicted: [ 0.77871382], target predicted: [ 0.46837789]
should be -1.0, predicted: [-0.34918159], target predicted: [-0.57912648]
start searching new action
get new action:  0.25599241256713867
start getting new_q
get new_q:  0.06779623031616211
train start:
critic loss [0.034872383, 0.01935803]
critic loss [0.03350037, 0.01886937]
critic loss [0.031093076, 0.018313346]
critic loss [0.028135266, 0.017757576]
critic

Episode : 520 Replay Buffer 50000
TOTAL REWARD @ 520-th Episode  : Reward 1
Total Step: 125
start searching new action
get new action:  0.28998851776123047
start getting new_q
get new_q:  0.08914899826049805
train start:
critic loss [0.026355859, 0.026291188]
critic loss [0.027069708, 0.026957754]
critic loss [0.027315652, 0.027192745]
critic loss [0.027123543, 0.027029835]
critic loss [0.02657146, 0.026527574]
actor loss -22.4824
train end 0.805004358291626
should be -1.0, predicted: [-1.1014595], target predicted: [-1.2032932]
start searching new action
get new action:  0.24851107597351074
start getting new_q
get new_q:  0.0702660083770752
train start:
critic loss [0.024292741, 0.023219079]
critic loss [0.024037564, 0.022836981]
critic loss [0.023674535, 0.022354277]
critic loss [0.02319793, 0.021786414]
critic loss [0.022612724, 0.021157652]
actor loss 39.3579
train end 0.5778841972351074
should be 1.0, predicted: [ 0.93947566], target predicted: [ 0.65640253]
should be 1.0, predict

critic loss [0.020693062, 0.018564612]
actor loss 17.4906
train end 0.5222632884979248
should be -1.0, predicted: [-1.15257442], target predicted: [-0.91315144]
should be 1.0, predicted: [ 0.93532538], target predicted: [ 0.64182651]
should be 1.0, predicted: [ 0.81024665], target predicted: [ 0.51718169]
start searching new action
get new action:  0.23504090309143066
start getting new_q
get new_q:  0.0626986026763916
train start:
critic loss [0.066098176, 0.017211571]
critic loss [0.063300855, 0.016762294]
critic loss [0.058304317, 0.016032282]
critic loss [0.051924519, 0.0151356]
critic loss [0.044943728, 0.014181815]
actor loss 48.8707
train end 0.5362544059753418
should be 1.0, predicted: [ 0.45147377], target predicted: [ 0.45074999]
should be 1.0, predicted: [ 1.1292274], target predicted: [ 0.82410651]
should be -1.0, predicted: [-0.66490078], target predicted: [-0.45118457]
should be 1.0, predicted: [ 1.26859939], target predicted: [ 0.91556597]
Episode : 522 Replay Buffer 5000

get new action:  0.22987103462219238
start getting new_q
get new_q:  0.06091737747192383
train start:
critic loss [0.01974912, 0.016072892]
critic loss [0.019476322, 0.015849411]
critic loss [0.018995559, 0.01561373]
critic loss [0.018415403, 0.015386421]
critic loss [0.017808042, 0.015181394]
actor loss 42.2092
train end 0.5286438465118408
should be 1.0, predicted: [ 1.04996395], target predicted: [ 1.02927673]
should be 1.0, predicted: [ 0.55666506], target predicted: [ 0.73634928]
should be 1.0, predicted: [ 0.97361368], target predicted: [ 0.9444499]
start searching new action
get new action:  0.21918964385986328
start getting new_q
get new_q:  0.0612797737121582
train start:
critic loss [0.054081038, 0.026284005]
critic loss [0.053483006, 0.026276629]
critic loss [0.050526582, 0.025793873]
critic loss [0.045915015, 0.024970446]
critic loss [0.040527768, 0.023990834]
actor loss 11.4682
train end 0.5081148147583008
should be 1.0, predicted: [-0.19048837], target predicted: [ 0.52794

get new action:  0.22901630401611328
start getting new_q
get new_q:  0.06770706176757812
train start:
critic loss [0.030168261, 0.025818981]
critic loss [0.029651504, 0.026111815]
critic loss [0.029087983, 0.026317621]
critic loss [0.028491737, 0.026390195]
critic loss [0.027865134, 0.026306514]
actor loss 31.4303
train end 0.5330631732940674
should be 1.0, predicted: [ 0.86552113], target predicted: [ 0.64184064]
should be 1.0, predicted: [ 0.83842033], target predicted: [ 0.84513491]
should be 1.0, predicted: [ 0.96848983], target predicted: [ 0.78634876]
start searching new action
get new action:  0.2330470085144043
start getting new_q
get new_q:  0.06396055221557617
train start:
critic loss [0.019647783, 0.01859013]
critic loss [0.019279245, 0.018366843]
critic loss [0.018764198, 0.017984055]
critic loss [0.018137366, 0.017495709]
critic loss [0.017449414, 0.016956612]
actor loss 88.5587
train end 0.6346495151519775
should be 1.0, predicted: [ 1.09798837], target predicted: [ 0.222

critic loss [0.028663568, 0.028337087]
critic loss [0.028180325, 0.028058402]
critic loss [0.027282774, 0.027264226]
critic loss [0.026114017, 0.026110869]
actor loss 19.0844
train end 0.6050224304199219
should be -1.0, predicted: [-0.99462831], target predicted: [-0.95985109]
start searching new action
get new action:  0.23807430267333984
start getting new_q
get new_q:  0.07877945899963379
train start:
critic loss [0.083525345, 0.025447892]
critic loss [0.078294568, 0.02433211]
critic loss [0.070384726, 0.023167046]
critic loss [0.061539561, 0.022232462]
critic loss [0.052783247, 0.021632746]
actor loss 75.5084
train end 0.7043838500976562
should be 1.0, predicted: [ 0.88747245], target predicted: [ 0.77595276]
should be 1.0, predicted: [ 1.08265507], target predicted: [ 1.01258612]
should be 1.0, predicted: [ 0.87914342], target predicted: [ 0.9064098]
should be 1.0, predicted: [ 0.97435111], target predicted: [ 0.8738001]
should be 1.0, predicted: [ 0.92541474], target predicted: [ 

Episode : 529 Replay Buffer 50000
TOTAL REWARD @ 529-th Episode  : Reward -1
Total Step: 129
start searching new action
get new action:  0.2501792907714844
start getting new_q
get new_q:  0.06503152847290039
train start:
critic loss [0.037073225, 0.023863317]
critic loss [0.035876177, 0.024014873]
critic loss [0.033074655, 0.023594847]
critic loss [0.029527066, 0.022733135]
critic loss [0.026035335, 0.021588121]
actor loss 63.4451
train end 0.5301837921142578
should be -1.0, predicted: [-0.86633712], target predicted: [-0.94529742]
should be 1.0, predicted: [ 0.97122431], target predicted: [ 0.86426258]
should be 1.0, predicted: [ 1.02051139], target predicted: [ 1.08270085]
should be -1.0, predicted: [-1.15703833], target predicted: [-1.13868427]
start searching new action
get new action:  0.23236536979675293
start getting new_q
get new_q:  0.06622719764709473
train start:
critic loss [0.021225775, 0.020274911]
critic loss [0.020432916, 0.019783445]
critic loss [0.019768901, 0.0193038

critic loss [0.029797161, 0.029797161]
critic loss [0.029562004, 0.029562004]
critic loss [0.028929379, 0.028929379]
critic loss [0.027939312, 0.027939312]
actor loss 43.2522
train end 0.5815412998199463
start searching new action
get new action:  0.23824357986450195
start getting new_q
get new_q:  0.06599903106689453
train start:
critic loss [0.033533156, 0.033533156]
critic loss [0.032096494, 0.032096494]
critic loss [0.029988121, 0.029988121]
critic loss [0.027510084, 0.027510084]
critic loss [0.024941146, 0.024941146]
actor loss 56.0301
train end 0.5387873649597168
start searching new action
get new action:  0.22793936729431152
start getting new_q
get new_q:  0.07697057723999023
train start:
critic loss [0.028591778, 0.022712698]
critic loss [0.027291071, 0.021920642]
critic loss [0.025751801, 0.021242032]
critic loss [0.024069235, 0.020627934]
critic loss [0.022373168, 0.020037858]
actor loss 57.6887
train end 0.5280239582061768
should be 1.0, predicted: [ 0.96650732], target pred

critic loss [0.015739128, 0.013413768]
actor loss 47.7479
train end 0.5930299758911133
should be 1.0, predicted: [ 1.04027128], target predicted: [ 0.98292887]
should be 1.0, predicted: [ 0.8540073], target predicted: [ 0.9884091]
should be 1.0, predicted: [ 1.15839648], target predicted: [ 0.85624528]
should be 1.0, predicted: [ 0.85156077], target predicted: [ 0.73270011]
start searching new action
get new action:  0.25809407234191895
start getting new_q
get new_q:  0.07590866088867188
train start:
critic loss [0.017299814, 0.017299814]
critic loss [0.017390694, 0.017390694]
critic loss [0.017365512, 0.017365512]
critic loss [0.017198313, 0.017198313]
critic loss [0.016880928, 0.016880928]
actor loss 69.4329
train end 0.6980383396148682
start searching new action
get new action:  0.2489781379699707
start getting new_q
get new_q:  0.06865882873535156
train start:
critic loss [0.01873941, 0.017146453]
critic loss [0.018109402, 0.016751776]
critic loss [0.017283143, 0.016265336]
critic 

critic loss [0.032853112, 0.020843405]
actor loss 94.7014
train end 0.5719220638275146
should be 1.0, predicted: [ 1.08556545], target predicted: [ 0.70512396]
should be 1.0, predicted: [ 0.89921331], target predicted: [ 0.30777732]
should be 1.0, predicted: [ 1.01016247], target predicted: [ 0.75159103]
should be 1.0, predicted: [ 0.96355307], target predicted: [ 0.77920175]
should be 1.0, predicted: [ 1.03452456], target predicted: [ 0.9632892]
start searching new action
get new action:  0.23392581939697266
start getting new_q
get new_q:  0.07014632225036621
train start:
critic loss [0.057382859, 0.02209574]
critic loss [0.048522152, 0.020032298]
critic loss [0.039017953, 0.018176695]
critic loss [0.030544572, 0.016854592]
critic loss [0.024343308, 0.016238321]
actor loss 78.48
train end 0.602647066116333
should be 1.0, predicted: [ 1.08421481], target predicted: [ 0.34442922]
should be -1.0, predicted: [-0.61109197], target predicted: [-1.10791731]
should be 1.0, predicted: [ 1.2690

Episode : 536 Replay Buffer 50000
TOTAL REWARD @ 536-th Episode  : Reward 1
Total Step: 140
start searching new action
get new action:  0.2490220069885254
start getting new_q
get new_q:  0.06903076171875
train start:
critic loss [0.01955962, 0.016924009]
critic loss [0.019636068, 0.016955491]
critic loss [0.019461585, 0.016941097]
critic loss [0.019028999, 0.016842719]
critic loss [0.018383043, 0.016645771]
actor loss 21.4446
train end 0.5720257759094238
should be 1.0, predicted: [ 0.77844048], target predicted: [ 0.6767953]
should be -1.0, predicted: [-0.83434689], target predicted: [-0.96807873]
should be 1.0, predicted: [ 1.00965595], target predicted: [ 0.83776712]
start searching new action
get new action:  0.27654528617858887
start getting new_q
get new_q:  0.06707358360290527
train start:
critic loss [0.021647044, 0.015565276]
critic loss [0.02127254, 0.015446238]
critic loss [0.020418826, 0.015257805]
critic loss [0.019289214, 0.015020914]
critic loss [0.01806407, 0.014756878]


get new action:  0.2297072410583496
start getting new_q
get new_q:  0.06736350059509277
train start:
critic loss [0.042974919, 0.020909872]
critic loss [0.043175913, 0.020880014]
critic loss [0.042060781, 0.020703789]
critic loss [0.039802827, 0.02032613]
critic loss [0.036705457, 0.0197496]
actor loss 66.6113
train end 0.521104097366333
should be 1.0, predicted: [ 0.8770076], target predicted: [ 0.85698712]
should be 1.0, predicted: [ 0.88002598], target predicted: [ 0.70231473]
should be 1.0, predicted: [ 0.77653474], target predicted: [ 0.80807412]
should be -1.0, predicted: [-0.93439174], target predicted: [-1.14876962]
should be 1.0, predicted: [ 0.65582764], target predicted: [ 0.38350904]
should be -1.0, predicted: [-0.45515734], target predicted: [-0.76307654]
start searching new action
get new action:  0.23966765403747559
start getting new_q
get new_q:  0.06238102912902832
train start:
critic loss [0.032740902, 0.024912708]
critic loss [0.032459162, 0.02428632]
critic loss [0.

critic loss [0.014795233, 0.014678529]
critic loss [0.015008396, 0.014912423]
critic loss [0.015231561, 0.015123239]
critic loss [0.01538507, 0.015253114]
actor loss 38.3286
train end 0.6402890682220459
should be -1.0, predicted: [-1.04998446], target predicted: [-0.87034774]
should be 1.0, predicted: [ 1.16405272], target predicted: [ 0.90029067]
start searching new action
get new action:  0.2790067195892334
start getting new_q
get new_q:  0.07717275619506836
train start:
critic loss [0.030654715, 0.021247374]
critic loss [0.030242883, 0.021249605]
critic loss [0.029164288, 0.021031395]
critic loss [0.027607461, 0.020644564]
critic loss [0.025776038, 0.020149002]
actor loss 51.5865
train end 0.7050375938415527
should be 1.0, predicted: [ 0.75411528], target predicted: [ 0.60942358]
start searching new action
get new action:  0.23743271827697754
start getting new_q
get new_q:  0.06603455543518066
train start:
critic loss [0.017608112, 0.015702618]
critic loss [0.01743288, 0.015788957]


critic loss [0.019919135, 0.019919135]
critic loss [0.020804927, 0.020804927]
actor loss 32.5977
train end 0.6932668685913086
start searching new action
get new action:  0.22925591468811035
start getting new_q
get new_q:  0.06708335876464844
train start:
critic loss [0.026720617, 0.023653621]
critic loss [0.026662312, 0.023595741]
critic loss [0.025885174, 0.023091896]
critic loss [0.024579626, 0.022267684]
critic loss [0.022778923, 0.021158166]
actor loss 65.2866
train end 0.5976784229278564
should be 1.0, predicted: [ 1.0010165], target predicted: [ 0.93140006]
should be 1.0, predicted: [ 0.75193638], target predicted: [ 0.92407322]
should be 1.0, predicted: [ 0.7950173], target predicted: [ 0.92826396]
should be 1.0, predicted: [ 0.86205822], target predicted: [ 0.5696609]
start searching new action
get new action:  0.23120427131652832
start getting new_q
get new_q:  0.06791949272155762
train start:
critic loss [0.10692935, 0.036896557]
critic loss [0.097053029, 0.035389215]
critic 

get new action:  0.2431347370147705
start getting new_q
get new_q:  0.06966352462768555
train start:
critic loss [0.046046954, 0.022375537]
critic loss [0.044283852, 0.022002414]
critic loss [0.04072525, 0.021442924]
critic loss [0.036198117, 0.020814752]
critic loss [0.03158449, 0.020232024]
actor loss 3.53017
train end 0.7448039054870605
should be 1.0, predicted: [ 0.77281785], target predicted: [ 0.93190068]
should be -1.0, predicted: [-0.98196387], target predicted: [-0.79990983]
should be 1.0, predicted: [ 0.81123221], target predicted: [ 0.88408566]
should be 1.0, predicted: [ 0.3975718], target predicted: [ 0.46977976]
should be -1.0, predicted: [-0.92240584], target predicted: [-0.70475113]
start searching new action
get new action:  0.24875617027282715
start getting new_q
get new_q:  0.09125137329101562
train start:
critic loss [0.019169204, 0.018627658]
critic loss [0.019379325, 0.01880585]
critic loss [0.019501362, 0.01892522]
critic loss [0.019518029, 0.018966228]
critic lo

actor loss 11.4692
train end 0.6398441791534424
should be 1.0, predicted: [ 0.60771525], target predicted: [ 0.45790488]
should be 1.0, predicted: [ 1.11431146], target predicted: [ 1.07640219]
Episode : 545 Replay Buffer 50000
TOTAL REWARD @ 545-th Episode  : Reward -1
Total Step: 137
start searching new action
get new action:  0.23514294624328613
start getting new_q
get new_q:  0.07202649116516113
train start:
critic loss [0.041615542, 0.025393771]
critic loss [0.039727215, 0.025112228]
critic loss [0.037281428, 0.024728447]
critic loss [0.034631148, 0.024289165]
critic loss [0.03191293, 0.023813773]
actor loss 42.6072
train end 0.5603413581848145
should be 1.0, predicted: [ 0.64328164], target predicted: [ 0.61548358]
should be 1.0, predicted: [ 0.86587483], target predicted: [ 0.93386114]
should be 1.0, predicted: [ 1.13931084], target predicted: [ 1.09355557]
start searching new action
get new action:  0.2330176830291748
start getting new_q
get new_q:  0.06371521949768066
train st

get new action:  0.23512625694274902
start getting new_q
get new_q:  0.07278275489807129
train start:
critic loss [0.035267882, 0.015497487]
critic loss [0.036888059, 0.01555738]
critic loss [0.035180394, 0.015131134]
critic loss [0.030781651, 0.014271314]
critic loss [0.024989191, 0.013196913]
actor loss 5.02449
train end 0.673393964767456
should be -1.0, predicted: [-0.96661699], target predicted: [-0.87520742]
should be 1.0, predicted: [ 0.23484072], target predicted: [ 0.62863803]
should be 1.0, predicted: [ 0.66505551], target predicted: [ 1.02897906]
Episode : 547 Replay Buffer 50000
TOTAL REWARD @ 547-th Episode  : Reward -1
Total Step: 106
start searching new action
get new action:  0.24054288864135742
start getting new_q
get new_q:  0.0744020938873291
train start:
critic loss [0.04557275, 0.021333475]
critic loss [0.041290533, 0.021347675]
critic loss [0.035849478, 0.02103355]
critic loss [0.030110033, 0.020523328]
critic loss [0.02562755, 0.020021591]
actor loss -5.08186
trai

get new action:  0.2359325885772705
start getting new_q
get new_q:  0.06412529945373535
train start:
critic loss [0.074596174, 0.023443174]
critic loss [0.069418825, 0.021920159]
critic loss [0.062254287, 0.02060757]
critic loss [0.054035082, 0.019604791]
critic loss [0.045406148, 0.01882969]
actor loss 39.6188
train end 0.5168309211730957
should be -1.0, predicted: [-0.66451728], target predicted: [-0.02061211]
should be 1.0, predicted: [ 0.43509161], target predicted: [ 0.19229569]
should be 1.0, predicted: [ 1.16111338], target predicted: [ 1.12430644]
start searching new action
get new action:  0.23267459869384766
start getting new_q
get new_q:  0.06690287590026855
train start:
critic loss [0.12384018, 0.033923365]
critic loss [0.1165968, 0.033263076]
critic loss [0.10513497, 0.032293119]
critic loss [0.091420189, 0.031227579]
critic loss [0.07708434, 0.03020414]
actor loss 33.7451
train end 0.507347822189331
should be 1.0, predicted: [-0.03464821], target predicted: [-0.2716668]
s

get new action:  0.22894740104675293
start getting new_q
get new_q:  0.060933589935302734
train start:
critic loss [0.082909748, 0.032046922]
critic loss [0.079139695, 0.031329762]
critic loss [0.071412385, 0.029981557]
critic loss [0.061432846, 0.028255269]
critic loss [0.051278915, 0.026427358]
actor loss 33.8163
train end 0.5536839962005615
should be -1.0, predicted: [-0.20625921], target predicted: [-0.24050887]
should be 1.0, predicted: [ 0.88921463], target predicted: [ 0.94882894]
should be 1.0, predicted: [ 1.03197026], target predicted: [ 1.02545452]
should be 1.0, predicted: [ 0.88789576], target predicted: [ 0.87379038]
start searching new action
get new action:  0.2310185432434082
start getting new_q
get new_q:  0.07102179527282715
train start:
critic loss [0.023976259, 0.016608631]
critic loss [0.023021026, 0.016378297]
critic loss [0.021801421, 0.016164457]
critic loss [0.020517901, 0.016003564]
critic loss [0.019328594, 0.01589]
actor loss 2.08893
train end 0.52615928649

get new action:  0.2252368927001953
start getting new_q
get new_q:  0.06716370582580566
train start:
critic loss [0.040622555, 0.035931602]
critic loss [0.0423146, 0.037014306]
critic loss [0.042713419, 0.037328478]
critic loss [0.041714869, 0.036853604]
critic loss [0.039596841, 0.035708021]
actor loss 25.0498
train end 0.5616271495819092
should be 1.0, predicted: [ 1.2312386], target predicted: [ 0.88024569]
should be 1.0, predicted: [ 1.43077803], target predicted: [ 0.97728598]
should be 1.0, predicted: [ 1.17818832], target predicted: [ 0.90935987]
should be 1.0, predicted: [ 1.16960216], target predicted: [ 0.89181519]
start searching new action
get new action:  0.22693514823913574
start getting new_q
get new_q:  0.06192779541015625
train start:
critic loss [0.10190737, 0.041376889]
critic loss [0.087577477, 0.038133994]
critic loss [0.067986049, 0.033889215]
critic loss [0.049226582, 0.029697882]
critic loss [0.034744181, 0.026289497]
actor loss -6.52739
train end 0.619520664215

critic loss [0.038128041, 0.02326012]
actor loss 17.9637
train end 0.5548052787780762
should be 1.0, predicted: [-0.06604227], target predicted: [ 0.52487743]
start searching new action
get new action:  0.24004817008972168
start getting new_q
get new_q:  0.07430744171142578
train start:
critic loss [0.045923218, 0.043631706]
critic loss [0.045382671, 0.043678515]
critic loss [0.044282056, 0.043232858]
critic loss [0.042882986, 0.042397555]
critic loss [0.04142084, 0.041298606]
actor loss -19.1131
train end 0.5249888896942139
should be -1.0, predicted: [-1.00911224], target predicted: [-1.06607735]
start searching new action
get new action:  0.22601604461669922
start getting new_q
get new_q:  0.06334137916564941
train start:
critic loss [0.023619093, 0.016830549]
critic loss [0.0230317, 0.016163485]
critic loss [0.021948468, 0.01547884]
critic loss [0.020553024, 0.014846439]
critic loss [0.019053608, 0.014329037]
actor loss 37.8923
train end 0.5350799560546875
should be 1.0, predicted: 

critic loss [0.028000226, 0.020999109]
critic loss [0.027195875, 0.020453051]
critic loss [0.026085325, 0.019908506]
critic loss [0.024745783, 0.019359443]
actor loss 26.0585
train end 0.5219554901123047
should be -1.0, predicted: [-0.78675771], target predicted: [-0.69371456]
should be 1.0, predicted: [ 0.66854966], target predicted: [ 0.62069267]
start searching new action
get new action:  0.2338244915008545
start getting new_q
get new_q:  0.07507729530334473
train start:
critic loss [0.030239007, 0.027567167]
critic loss [0.029393964, 0.027103031]
critic loss [0.028324341, 0.026515432]
critic loss [0.027157357, 0.025855865]
critic loss [0.026005169, 0.025171705]
actor loss 63.2628
train end 0.6179027557373047
should be 1.0, predicted: [ 1.00981903], target predicted: [ 0.87963969]
start searching new action
get new action:  0.257080078125
start getting new_q
get new_q:  0.08672809600830078
train start:
critic loss [0.050212275, 0.026281366]
critic loss [0.048177999, 0.026578927]
cri

Episode : 558 Replay Buffer 50000
TOTAL REWARD @ 558-th Episode  : Reward -1
Total Step: 101
start searching new action
get new action:  0.2347729206085205
start getting new_q
get new_q:  0.06796765327453613
train start:
critic loss [0.022271149, 0.020649474]
critic loss [0.021921922, 0.020757429]
critic loss [0.021705685, 0.020847496]
critic loss [0.02158279, 0.020895664]
critic loss [0.021485742, 0.020874642]
actor loss 23.8891
train end 0.9676308631896973
should be 1.0, predicted: [ 1.088346], target predicted: [ 1.02195668]
should be -1.0, predicted: [-1.1705606], target predicted: [-1.24678934]
should be 1.0, predicted: [ 0.95047653], target predicted: [ 0.97183007]
start searching new action
get new action:  0.25636816024780273
start getting new_q
get new_q:  0.06629014015197754
train start:
critic loss [0.039164066, 0.026166717]
critic loss [0.038064752, 0.025897279]
critic loss [0.035858709, 0.025370877]
critic loss [0.033040963, 0.024698991]
critic loss [0.03024587, 0.02400524

critic loss [0.02733483, 0.019554645]
critic loss [0.025637042, 0.01902682]
critic loss [0.023615332, 0.01850719]
critic loss [0.021537155, 0.018034602]
actor loss 29.9628
train end 0.5207037925720215
should be 1.0, predicted: [ 0.77449739], target predicted: [ 0.53975618]
trained action prob map predicted by initial model for a starting game
[[[ 0.37368605  0.00378242  0.0037862   0.00378701  0.00377834  0.00378329
    0.00378326]
  [ 0.00378226  0.00378492  0.00378351  0.00378161  0.00378577  0.00378481
    0.00378277]
  [ 0.0037845   0.00378625  0.00378403  0.00378586  0.0037847   0.00378754
    0.00378538]
  [ 0.00378377  0.00378506  0.00378615  0.00378812  0.00378328  0.00378467
    0.00378693]
  [ 0.00378564  0.00378713  0.00377925  0.00378766  0.00378592  0.00378386
    0.00378011]
  [ 0.00378174  0.00378473  0.00378568  0.00378363  0.00378808  0.00378468
    0.00378409]
  [ 0.00378402  0.00378494  0.00378204  0.00378517  0.0037844   0.00378283
    0.44844592]]

 [[ 0.00171679  

critic loss [0.044485878, 0.023304572]
critic loss [0.040835604, 0.024081029]
critic loss [0.037289359, 0.024554553]
critic loss [0.034177046, 0.024716025]
actor loss 13.786
train end 0.9735920429229736
should be -1.0, predicted: [-0.77126974], target predicted: [-0.72025579]
should be 1.0, predicted: [ 0.62760097], target predicted: [ 0.08240141]
start searching new action
get new action:  0.2551126480102539
start getting new_q
get new_q:  0.07101607322692871
train start:
critic loss [0.040292054, 0.026842393]
critic loss [0.038720034, 0.026600823]
critic loss [0.036124043, 0.025954884]
critic loss [0.032979928, 0.02507522]
critic loss [0.029753368, 0.024131]
actor loss 51.9263
train end 0.6883201599121094
should be 1.0, predicted: [ 1.16744709], target predicted: [ 0.8358053]
should be -1.0, predicted: [-0.68350643], target predicted: [-0.49818766]
start searching new action
get new action:  0.24955296516418457
start getting new_q
get new_q:  0.06564569473266602
train start:
critic l

get new action:  0.21798992156982422
start getting new_q
get new_q:  0.06470870971679688
train start:
critic loss [0.01628329, 0.01628329]
critic loss [0.016660672, 0.016660672]
critic loss [0.017053837, 0.017053837]
critic loss [0.017324116, 0.017324116]
critic loss [0.017384585, 0.017384585]
actor loss 44.7052
train end 0.5202984809875488
start searching new action
get new action:  0.21804594993591309
start getting new_q
get new_q:  0.06201577186584473
train start:
critic loss [0.031163992, 0.019974805]
critic loss [0.029433645, 0.019606151]
critic loss [0.026936829, 0.019073457]
critic loss [0.024211964, 0.018492639]
critic loss [0.0218214, 0.017974539]
actor loss 38.4862
train end 0.5131576061248779
should be 1.0, predicted: [ 1.10285449], target predicted: [ 0.97668368]
should be 1.0, predicted: [ 0.85652059], target predicted: [ 0.29833743]
should be -1.0, predicted: [-0.93139535], target predicted: [-1.15200472]
start searching new action
get new action:  0.24350523948669434
sta

get new action:  0.222062349319458
start getting new_q
get new_q:  0.06585860252380371
train start:
critic loss [0.018801061, 0.014820131]
critic loss [0.018136621, 0.014788886]
critic loss [0.017420396, 0.014769523]
critic loss [0.016721455, 0.014749909]
critic loss [0.0160871, 0.01471173]
actor loss 45.2956
train end 0.5297446250915527
should be 1.0, predicted: [ 0.79934287], target predicted: [ 0.87894177]
should be 1.0, predicted: [ 1.01197505], target predicted: [ 0.96743822]
should be 1.0, predicted: [ 0.97186613], target predicted: [ 0.8489821]
start searching new action
get new action:  0.23511123657226562
start getting new_q
get new_q:  0.0659940242767334
train start:
critic loss [0.034600243, 0.027958428]
critic loss [0.033946875, 0.027642474]
critic loss [0.03192509, 0.026423575]
critic loss [0.029006258, 0.024602063]
critic loss [0.025716569, 0.022517305]
actor loss 21.2719
train end 0.5353283882141113
should be -1.0, predicted: [-0.96488774], target predicted: [-0.88763744

get new action:  0.23580718040466309
start getting new_q
get new_q:  0.06887412071228027
train start:
critic loss [0.03148596, 0.017343098]
critic loss [0.029414494, 0.017336717]
critic loss [0.026492536, 0.017163051]
critic loss [0.023396768, 0.016928725]
critic loss [0.020653777, 0.016732987]
actor loss 66.7986
train end 0.6232237815856934
should be 1.0, predicted: [ 0.97666836], target predicted: [ 0.77143431]
should be 1.0, predicted: [ 0.85816574], target predicted: [ 0.54231369]
should be 1.0, predicted: [ 0.86678654], target predicted: [ 0.94783759]
should be 1.0, predicted: [ 0.84461534], target predicted: [ 0.8801235]
start searching new action
get new action:  0.2476813793182373
start getting new_q
get new_q:  0.07596206665039062
train start:
critic loss [0.032900654, 0.025806304]
critic loss [0.03264042, 0.025906054]
critic loss [0.031660032, 0.025713693]
critic loss [0.030118575, 0.025255661]
critic loss [0.028238829, 0.02458363]
actor loss 41.7519
train end 0.6137788295745

get new action:  0.24492359161376953
start getting new_q
get new_q:  0.06975698471069336
train start:
critic loss [0.020423478, 0.016263586]
critic loss [0.019601509, 0.015900884]
critic loss [0.018641079, 0.015523309]
critic loss [0.017653022, 0.0151669]
critic loss [0.016728073, 0.014856616]
actor loss 37.0887
train end 0.6007905006408691
should be 1.0, predicted: [ 0.93293387], target predicted: [ 0.86779457]
should be 1.0, predicted: [ 0.76336801], target predicted: [ 0.84651262]
start searching new action
get new action:  0.25752854347229004
start getting new_q
get new_q:  0.07711315155029297
train start:
critic loss [0.018104047, 0.015718659]
critic loss [0.018154332, 0.015646832]
critic loss [0.017960329, 0.015519477]
critic loss [0.017517488, 0.015318249]
critic loss [0.016879495, 0.015041475]
actor loss 67.383
train end 0.7516450881958008
should be 1.0, predicted: [ 0.92890555], target predicted: [ 1.12373471]
should be 1.0, predicted: [ 1.16736794], target predicted: [ 1.3552

critic loss [0.024892893, 0.023366433]
actor loss 71.3943
train end 0.5469846725463867
should be 1.0, predicted: [ 1.08626974], target predicted: [ 1.0604105]
should be 1.0, predicted: [ 1.06509805], target predicted: [ 1.09811628]
start searching new action
get new action:  0.2294909954071045
start getting new_q
get new_q:  0.06859087944030762
train start:
critic loss [0.032545373, 0.025185455]
critic loss [0.031676941, 0.025364267]
critic loss [0.030694595, 0.025446612]
critic loss [0.029677687, 0.025415534]
critic loss [0.028686693, 0.025266573]
actor loss 46.338
train end 0.5317611694335938
should be 1.0, predicted: [ 0.75965357], target predicted: [ 0.62431276]
should be -1.0, predicted: [-0.67103487], target predicted: [-1.05109274]
should be 1.0, predicted: [ 1.02531803], target predicted: [ 0.97835112]
start searching new action
get new action:  0.23123955726623535
start getting new_q
get new_q:  0.07362103462219238
train start:
critic loss [0.03356988, 0.031343732]
critic loss

get new action:  0.2468869686126709
start getting new_q
get new_q:  0.06482076644897461
train start:
critic loss [0.025506727, 0.025040712]
critic loss [0.025491569, 0.025032049]
critic loss [0.025051346, 0.024671966]
critic loss [0.024280027, 0.024021115]
critic loss [0.023282051, 0.023144346]
actor loss 11.1096
train end 0.5677685737609863
should be -1.0, predicted: [-1.08416975], target predicted: [-0.65421945]
start searching new action
get new action:  0.27909064292907715
start getting new_q
get new_q:  0.07593274116516113
train start:
critic loss [0.014801132, 0.014800075]
critic loss [0.014430266, 0.014412674]
critic loss [0.014043342, 0.013990922]
critic loss [0.01363969, 0.013539243]
critic loss [0.013206243, 0.013051638]
actor loss 33.3133
train end 0.6123471260070801
should be 1.0, predicted: [ 0.81900561], target predicted: [ 0.92749894]
start searching new action
get new action:  0.24860072135925293
start getting new_q
get new_q:  0.06929469108581543
train start:
critic lo

critic loss [0.026797557, 0.024077848]
actor loss 33.9067
train end 0.57342529296875
should be 1.0, predicted: [ 1.0437932], target predicted: [ 1.0073365]
should be 1.0, predicted: [ 1.3367393], target predicted: [ 0.81399679]
start searching new action
get new action:  0.22851037979125977
start getting new_q
get new_q:  0.07623553276062012
train start:
critic loss [0.085146576, 0.047072172]
critic loss [0.078836694, 0.043991908]
critic loss [0.06901554, 0.039967678]
critic loss [0.057731684, 0.035610724]
critic loss [0.046907708, 0.031466864]
actor loss 20.7974
train end 0.5581076145172119
should be -1.0, predicted: [-0.52373201], target predicted: [-0.81406754]
start searching new action
get new action:  0.2322533130645752
start getting new_q
get new_q:  0.06799197196960449
train start:
critic loss [0.14491661, 0.042158663]
critic loss [0.1230993, 0.039176054]
critic loss [0.099770993, 0.036195002]
critic loss [0.078296982, 0.033682261]
critic loss [0.061895661, 0.031982914]
actor l

get new action:  0.23393845558166504
start getting new_q
get new_q:  0.07364273071289062
train start:
critic loss [0.024181686, 0.023501985]
critic loss [0.024626326, 0.023778908]
critic loss [0.024650002, 0.023741508]
critic loss [0.024230171, 0.023365669]
critic loss [0.023426188, 0.022676414]
actor loss -7.28754
train end 0.6389412879943848
should be -1.0, predicted: [-0.84045768], target predicted: [-0.90858203]
start searching new action
get new action:  0.22626805305480957
start getting new_q
get new_q:  0.06580448150634766
train start:
critic loss [0.082234479, 0.03589724]
critic loss [0.079602353, 0.035549358]
critic loss [0.07516408, 0.034949049]
critic loss [0.069516838, 0.034136273]
critic loss [0.063156061, 0.033159092]
actor loss 43.7795
train end 0.6164367198944092
should be 1.0, predicted: [ 0.41422668], target predicted: [ 0.19333482]
start searching new action
get new action:  0.23002123832702637
start getting new_q
get new_q:  0.060283660888671875
train start:
critic 

critic loss [0.058848336, 0.023774169]
critic loss [0.052865632, 0.02268659]
critic loss [0.045628816, 0.021290403]
critic loss [0.038448147, 0.019840088]
actor loss 8.88054
train end 0.521043062210083
should be 1.0, predicted: [ 0.74721122], target predicted: [ 0.71307737]
should be -1.0, predicted: [-1.46649516], target predicted: [-1.39679682]
should be -1.0, predicted: [-0.87666863], target predicted: [-0.10829614]
should be -1.0, predicted: [-0.9895255], target predicted: [-0.68321663]
start searching new action
get new action:  0.24794673919677734
start getting new_q
get new_q:  0.07003450393676758
train start:
critic loss [0.036653332, 0.020909363]
critic loss [0.034030546, 0.020366615]
critic loss [0.030047625, 0.019445494]
critic loss [0.026108364, 0.018478634]
critic loss [0.023238538, 0.017707482]
actor loss 64.6329
train end 0.5101377964019775
should be 1.0, predicted: [ 0.86296809], target predicted: [ 0.82564569]
should be -1.0, predicted: [-1.26152062], target predicted:

critic loss [0.016793717, 0.016526751]
actor loss 37.9719
train end 0.49481797218322754
should be 1.0, predicted: [ 0.94541979], target predicted: [ 0.87933463]
should be 1.0, predicted: [ 1.14949548], target predicted: [ 0.67372686]
start searching new action
get new action:  0.22985315322875977
start getting new_q
get new_q:  0.06743001937866211
train start:
critic loss [0.044607375, 0.023713803]
critic loss [0.044228192, 0.023995394]
critic loss [0.041794911, 0.023570435]
critic loss [0.037849233, 0.022584714]
critic loss [0.033086874, 0.021246182]
actor loss 66.0124
train end 0.5561549663543701
should be 1.0, predicted: [ 1.09290361], target predicted: [ 0.58141875]
should be 1.0, predicted: [ 1.30374825], target predicted: [ 1.07136679]
should be 1.0, predicted: [ 1.21330965], target predicted: [ 1.01284397]
trained action prob map predicted by initial model for a starting game
[[[ 0.41942722  0.00302322  0.00302583  0.00302683  0.00302079  0.00302376
    0.00302357]
  [ 0.003023 

get new action:  0.23395562171936035
start getting new_q
get new_q:  0.07080721855163574
train start:
critic loss [0.071922049, 0.025552342]
critic loss [0.066053778, 0.024905756]
critic loss [0.058708631, 0.024341989]
critic loss [0.051171727, 0.024002401]
critic loss [0.044401091, 0.02394972]
actor loss 22.907
train end 0.5123310089111328
should be -1.0, predicted: [-0.75250977], target predicted: [-0.27398029]
should be 1.0, predicted: [ 0.98564619], target predicted: [ 1.01609492]
should be 1.0, predicted: [ 0.87973583], target predicted: [ 0.93403125]
should be 1.0, predicted: [ 0.8141734], target predicted: [ 0.91189754]
should be -1.0, predicted: [-1.25367033], target predicted: [-1.00551832]
should be -1.0, predicted: [-0.66897619], target predicted: [-0.46160239]
start searching new action
get new action:  0.22487902641296387
start getting new_q
get new_q:  0.06541061401367188
train start:
critic loss [0.047102004, 0.032714114]
critic loss [0.047455207, 0.033389889]
critic los

get new action:  0.23796653747558594
start getting new_q
get new_q:  0.07834053039550781
train start:
critic loss [0.015391132, 0.015252358]
critic loss [0.015283189, 0.015131611]
critic loss [0.015087817, 0.014941657]
critic loss [0.014819239, 0.014695366]
critic loss [0.014494675, 0.014403933]
actor loss 62.9608
train end 0.6496434211730957
should be 1.0, predicted: [ 0.94552964], target predicted: [ 0.8475551]
start searching new action
get new action:  0.22699737548828125
start getting new_q
get new_q:  0.06843829154968262
train start:
critic loss [0.023111118, 0.01749631]
critic loss [0.022741454, 0.017175226]
critic loss [0.022131018, 0.016812287]
critic loss [0.021343805, 0.016430743]
critic loss [0.02044389, 0.016048044]
actor loss 74.2753
train end 0.5678684711456299
should be 1.0, predicted: [ 0.9777928], target predicted: [ 0.97811824]
should be -1.0, predicted: [-0.6372332], target predicted: [-0.48653549]
should be 1.0, predicted: [ 0.9759028], target predicted: [ 1.065088

get new action:  0.2442009449005127
start getting new_q
get new_q:  0.07156682014465332
train start:
critic loss [0.017796446, 0.016916662]
critic loss [0.017663185, 0.016783135]
critic loss [0.017379105, 0.016531603]
critic loss [0.016965527, 0.01618195]
critic loss [0.016456321, 0.01576202]
actor loss 56.1931
train end 0.5377967357635498
should be -1.0, predicted: [-0.93267018], target predicted: [-0.99365413]
should be 1.0, predicted: [ 1.13085651], target predicted: [ 1.14553821]
should be -1.0, predicted: [-1.01149476], target predicted: [-1.006827]
start searching new action
get new action:  0.23542141914367676
start getting new_q
get new_q:  0.07727622985839844
train start:
critic loss [0.053458907, 0.024390446]
critic loss [0.051406089, 0.024114119]
critic loss [0.047716364, 0.023777224]
critic loss [0.043102399, 0.023443133]
critic loss [0.038324676, 0.023174193]
actor loss 30.3494
train end 0.5237336158752441
should be 1.0, predicted: [ 0.81524086], target predicted: [ 0.7336

critic loss [0.037358075, 0.034823697]
critic loss [0.035557956, 0.03366109]
critic loss [0.033582665, 0.032363757]
critic loss [0.031689502, 0.03104529]
actor loss 19.2018
train end 0.5160341262817383
should be 1.0, predicted: [ 1.01398003], target predicted: [ 1.17035079]
should be 1.0, predicted: [ 1.03991187], target predicted: [ 0.94196957]
should be 1.0, predicted: [ 0.96812773], target predicted: [ 0.99622875]
Episode : 587 Replay Buffer 50000
TOTAL REWARD @ 587-th Episode  : Reward 1
Total Step: 124
start searching new action
get new action:  0.23985743522644043
start getting new_q
get new_q:  0.07598710060119629
train start:
critic loss [0.02405565, 0.01940332]
critic loss [0.02468688, 0.019315813]
critic loss [0.024939239, 0.019203173]
critic loss [0.024710823, 0.019030672]
critic loss [0.024015995, 0.018789567]
actor loss 39.2502
train end 0.6787903308868408
should be 1.0, predicted: [ 0.93134177], target predicted: [ 1.04194868]
should be 1.0, predicted: [ 0.99264055], targ

critic loss [0.023698442, 0.020791784]
actor loss 23.5111
train end 0.5146644115447998
should be 1.0, predicted: [ 1.05316198], target predicted: [ 0.75250328]
should be 1.0, predicted: [ 1.07544315], target predicted: [ 1.06458712]
start searching new action
get new action:  0.24036574363708496
start getting new_q
get new_q:  0.06388521194458008
train start:
critic loss [0.023693698, 0.021293826]
critic loss [0.023021966, 0.020698853]
critic loss [0.021882396, 0.019798879]
critic loss [0.020466577, 0.018732473]
critic loss [0.018976629, 0.017639682]
actor loss 59.2558
train end 0.5376017093658447
should be 1.0, predicted: [ 1.05848122], target predicted: [ 1.04987359]
should be 1.0, predicted: [ 0.9257099], target predicted: [ 1.07031858]
Episode : 589 Replay Buffer 50000
TOTAL REWARD @ 589-th Episode  : Reward 1
Total Step: 131
start searching new action
get new action:  0.25157618522644043
start getting new_q
get new_q:  0.07825922966003418
train start:
critic loss [0.033737011, 0.0

critic loss [0.028183792, 0.028133873]
actor loss 4.43196
train end 0.522960901260376
should be 1.0, predicted: [ 1.05176139], target predicted: [ 1.24564278]
start searching new action
get new action:  0.23090314865112305
start getting new_q
get new_q:  0.0652005672454834
train start:
critic loss [0.033071931, 0.024462972]
critic loss [0.030803243, 0.0225842]
critic loss [0.028395798, 0.020855509]
critic loss [0.026120786, 0.019437667]
critic loss [0.024103925, 0.018403459]
actor loss 44.84
train end 0.5095663070678711
should be -1.0, predicted: [-0.70695359], target predicted: [-0.83742017]
should be 1.0, predicted: [ 0.93377435], target predicted: [ 0.89624697]
should be 1.0, predicted: [ 0.71693552], target predicted: [ 0.76697469]
start searching new action
get new action:  0.23456454277038574
start getting new_q
get new_q:  0.07047128677368164
train start:
critic loss [0.02153267, 0.019916076]
critic loss [0.021054707, 0.019924339]
critic loss [0.020688239, 0.019975945]
critic lo

get new action:  0.2490072250366211
start getting new_q
get new_q:  0.06877470016479492
train start:
critic loss [0.036705431, 0.029772222]
critic loss [0.035309371, 0.029200526]
critic loss [0.033988129, 0.028683221]
critic loss [0.032736015, 0.028252887]
critic loss [0.031620372, 0.027928503]
actor loss 1.67541
train end 0.6991899013519287
should be -1.0, predicted: [-0.65759474], target predicted: [-0.27825636]
should be -1.0, predicted: [-0.98258311], target predicted: [-1.01322424]
start searching new action
get new action:  0.26660799980163574
start getting new_q
get new_q:  0.07164287567138672
train start:
critic loss [0.015200668, 0.014611978]
critic loss [0.015488814, 0.014920388]
critic loss [0.015613961, 0.015090962]
critic loss [0.015554873, 0.015089145]
critic loss [0.015318633, 0.014913743]
actor loss 25.5431
train end 0.5937020778656006
should be 1.0, predicted: [ 1.02061188], target predicted: [ 0.94552189]
should be 1.0, predicted: [ 0.78834456], target predicted: [ 0.

get new action:  0.2472703456878662
start getting new_q
get new_q:  0.06849360466003418
train start:
critic loss [0.080382183, 0.03207618]
critic loss [0.078289524, 0.031386957]
critic loss [0.074631624, 0.030469989]
critic loss [0.069857284, 0.029398367]
critic loss [0.064208485, 0.028228283]
actor loss 5.27652
train end 0.5477371215820312
should be -1.0, predicted: [-0.92585057], target predicted: [-0.93395263]
should be 1.0, predicted: [ 0.37580413], target predicted: [ 0.50875074]
should be -1.0, predicted: [-0.29140413], target predicted: [ 0.03664127]
should be 1.0, predicted: [ 0.75226068], target predicted: [ 0.91507798]
start searching new action
get new action:  0.2307417392730713
start getting new_q
get new_q:  0.07692503929138184
train start:
critic loss [0.044949144, 0.027014472]
critic loss [0.043059397, 0.026088651]
critic loss [0.040421039, 0.025047133]
critic loss [0.037289634, 0.023941537]
critic loss [0.03390006, 0.022812894]
actor loss 30.3065
train end 0.6504139900

get new action:  0.2511119842529297
start getting new_q
get new_q:  0.06817436218261719
train start:
critic loss [0.087611221, 0.041588187]
critic loss [0.082034327, 0.041030496]
critic loss [0.071689084, 0.039082497]
critic loss [0.059777658, 0.036470823]
critic loss [0.049100719, 0.033861302]
actor loss 33.9552
train end 0.5305969715118408
should be 1.0, predicted: [ 0.88978779], target predicted: [ 0.86281872]
should be -1.0, predicted: [-0.64551759], target predicted: [-0.83606309]
should be -1.0, predicted: [-1.13675582], target predicted: [-1.08206224]
should be 1.0, predicted: [ 0.71608436], target predicted: [ 0.73844934]
should be 1.0, predicted: [ 0.79307562], target predicted: [ 0.78026897]
start searching new action
get new action:  0.23073959350585938
start getting new_q
get new_q:  0.06230640411376953
train start:
critic loss [0.036410771, 0.033183504]
critic loss [0.035574127, 0.032025322]
critic loss [0.035070255, 0.031280681]
critic loss [0.034788132, 0.030836182]
crit

critic loss [0.038818792, 0.018530373]
critic loss [0.033630747, 0.018819753]
critic loss [0.029517204, 0.019464234]
critic loss [0.026808238, 0.02035914]
actor loss 65.3671
train end 0.5248434543609619
should be 1.0, predicted: [ 0.46674973], target predicted: [ 0.75846112]
should be 1.0, predicted: [ 0.80247539], target predicted: [ 0.76543337]
should be 1.0, predicted: [ 1.04750967], target predicted: [ 0.95705831]
Episode : 598 Replay Buffer 50000
TOTAL REWARD @ 598-th Episode  : Reward 1
Total Step: 120
start searching new action
get new action:  0.2479405403137207
start getting new_q
get new_q:  0.06769633293151855
train start:
critic loss [0.11050989, 0.046033476]
critic loss [0.11075856, 0.046981841]
critic loss [0.10482661, 0.046138257]
critic loss [0.094529629, 0.043945611]
critic loss [0.0818564, 0.04097547]
actor loss 4.67869
train end 0.5219871997833252
should be 1.0, predicted: [ 0.90982181], target predicted: [ 0.86532634]
should be 1.0, predicted: [ 0.89300948], target 

get new action:  0.23098063468933105
start getting new_q
get new_q:  0.09596467018127441
train start:
critic loss [0.089586198, 0.029837662]
critic loss [0.087867349, 0.029430421]
critic loss [0.083860613, 0.028394116]
critic loss [0.078255229, 0.026972238]
critic loss [0.07175149, 0.025458697]
actor loss 72.9984
train end 0.622591495513916
should be 1.0, predicted: [ 1.05157459], target predicted: [ 0.88379335]
should be 1.0, predicted: [ 0.07301699], target predicted: [-0.26508886]
should be 1.0, predicted: [ 0.97237915], target predicted: [ 0.79204643]
start searching new action
get new action:  0.24219489097595215
start getting new_q
get new_q:  0.08314657211303711
train start:
critic loss [0.024461145, 0.020524282]
critic loss [0.025919519, 0.021430075]
critic loss [0.026653407, 0.02199096]
critic loss [0.026570447, 0.022108298]
critic loss [0.025763262, 0.021776896]
actor loss 8.2901
train end 0.5402307510375977
should be 1.0, predicted: [ 0.93191677], target predicted: [ 0.94045

critic loss [0.11623368, 0.051591568]
critic loss [0.10954835, 0.050290495]
critic loss [0.09759514, 0.048230715]
critic loss [0.081801936, 0.045643657]
critic loss [0.066146553, 0.043003112]
actor loss -102.105
train end 0.7122650146484375
should be 1.0, predicted: [ 0.73202789], target predicted: [ 0.80143809]
should be -1.0, predicted: [-0.56997317], target predicted: [-0.21329144]
should be 1.0, predicted: [ 0.95643127], target predicted: [ 0.93520921]
should be 1.0, predicted: [ 0.77079654], target predicted: [ 0.54067737]
start searching new action
get new action:  0.24736642837524414
start getting new_q
get new_q:  0.06834173202514648
train start:
critic loss [0.042469546, 0.042469546]
critic loss [0.042197913, 0.042197913]
critic loss [0.04189742, 0.04189742]
critic loss [0.04154202, 0.04154202]
critic loss [0.041086737, 0.041086737]
actor loss -102.608
train end 0.6793758869171143
start searching new action
get new action:  0.2541356086730957
start getting new_q
get new_q:  0.

critic loss [0.045976203, 0.031941082]
critic loss [0.042406082, 0.03433653]
actor loss -45.5715
train end 0.6760444641113281
should be 1.0, predicted: [ 1.16685176], target predicted: [ 0.99124599]
should be 1.0, predicted: [ 0.80053586], target predicted: [ 0.5467037]
should be 1.0, predicted: [ 0.76303047], target predicted: [ 0.40485811]
should be 1.0, predicted: [ 0.90129775], target predicted: [ 0.69885528]
start searching new action
get new action:  0.25184106826782227
start getting new_q
get new_q:  0.06507229804992676
train start:
critic loss [0.053967625, 0.04997468]
critic loss [0.055992857, 0.052901201]
critic loss [0.056458354, 0.054560952]
critic loss [0.055845335, 0.055011466]
critic loss [0.054739956, 0.054500226]
actor loss -45.1022
train end 0.6873683929443359
should be 1.0, predicted: [ 0.61585093], target predicted: [ 0.79257637]
should be 1.0, predicted: [ 1.01337421], target predicted: [ 0.42393836]
start searching new action
get new action:  0.25254106521606445
s

Episode : 605 Replay Buffer 50000
TOTAL REWARD @ 605-th Episode  : Reward 1
Total Step: 96
start searching new action
get new action:  0.29314255714416504
start getting new_q
get new_q:  0.10040879249572754
train start:
critic loss [0.02764616, 0.02764616]
critic loss [0.027366672, 0.027366672]
critic loss [0.026990879, 0.026990879]
critic loss [0.026555687, 0.026555687]
critic loss [0.026085764, 0.026085764]
actor loss -36.5292
train end 0.8815600872039795
start searching new action
get new action:  0.2425854206085205
start getting new_q
get new_q:  0.06934261322021484
train start:
critic loss [0.17231065, 0.051484354]
critic loss [0.16373321, 0.050274357]
critic loss [0.14876851, 0.047895722]
critic loss [0.13018174, 0.044841602]
critic loss [0.11038588, 0.041569576]
actor loss -67.7874
train end 0.5880565643310547
should be -1.0, predicted: [-0.39312866], target predicted: [-0.19657296]
should be 1.0, predicted: [ 0.84155506], target predicted: [ 1.14947093]
should be -1.0, predicte

get new action:  0.2661857604980469
start getting new_q
get new_q:  0.08892488479614258
train start:
critic loss [0.02401088, 0.02401088]
critic loss [0.023468688, 0.023468688]
critic loss [0.022905745, 0.022905745]
critic loss [0.022375409, 0.022375409]
critic loss [0.0219163, 0.0219163]
actor loss -39.7424
train end 0.7457015514373779
Episode : 607 Replay Buffer 50000
TOTAL REWARD @ 607-th Episode  : Reward -1
Total Step: 15
start searching new action
get new action:  0.24909043312072754
start getting new_q
get new_q:  0.0694739818572998
train start:
critic loss [0.035271376, 0.028330289]
critic loss [0.034790695, 0.028231442]
critic loss [0.033912167, 0.027967717]
critic loss [0.032710046, 0.02753469]
critic loss [0.031266175, 0.026942825]
actor loss -33.1993
train end 0.5702245235443115
should be -1.0, predicted: [-0.49069476], target predicted: [-0.47641185]
start searching new action
get new action:  0.2374889850616455
start getting new_q
get new_q:  0.06266975402832031
train sta

get new action:  0.25600099563598633
start getting new_q
get new_q:  0.0722496509552002
train start:
critic loss [0.16848545, 0.057292525]
critic loss [0.15850022, 0.054729249]
critic loss [0.13659519, 0.049839433]
critic loss [0.10840425, 0.04369507]
critic loss [0.079874337, 0.037410375]
actor loss -28.7846
train end 0.5426089763641357
should be -1.0, predicted: [-1.36953855], target predicted: [-1.23616803]
should be -1.0, predicted: [-0.48490241], target predicted: [-0.76780248]
should be 1.0, predicted: [ 1.4642266], target predicted: [ 1.27813649]
start searching new action
get new action:  0.2299964427947998
start getting new_q
get new_q:  0.06029200553894043
train start:
critic loss [0.034341238, 0.031674102]
critic loss [0.031597108, 0.030117448]
critic loss [0.02986289, 0.029155919]
critic loss [0.028862657, 0.028591607]
critic loss [0.028341182, 0.028289042]
actor loss -35.6333
train end 0.5426232814788818
should be -1.0, predicted: [-1.08220732], target predicted: [-0.87175

critic loss [0.02502073, 0.02458369]
actor loss -20.6618
train end 0.6427905559539795
should be -1.0, predicted: [-1.08094418], target predicted: [-0.88677442]
start searching new action
get new action:  0.2849118709564209
start getting new_q
get new_q:  0.09123873710632324
train start:
critic loss [0.032502212, 0.022878341]
critic loss [0.032109082, 0.022998331]
critic loss [0.031489369, 0.023066379]
critic loss [0.030688643, 0.02306908]
critic loss [0.029756509, 0.022999678]
actor loss -35.7788
train end 0.6133649349212646
should be -1.0, predicted: [-0.6305185], target predicted: [-0.41717306]
start searching new action
get new action:  0.24498558044433594
start getting new_q
get new_q:  0.061170339584350586
train start:
critic loss [0.035553765, 0.031241478]
critic loss [0.035096586, 0.031183425]
critic loss [0.034131914, 0.03093297]
critic loss [0.032859884, 0.030521661]
critic loss [0.031501818, 0.02998893]
actor loss -37.8618
train end 0.546252965927124
should be 1.0, predicted:

actor loss 13.653
train end 0.7647089958190918
should be -1.0, predicted: [-0.95514065], target predicted: [-0.87891173]
should be 1.0, predicted: [ 0.74297589], target predicted: [ 0.71876132]
should be 1.0, predicted: [ 1.08748293], target predicted: [ 1.07754028]
start searching new action
get new action:  0.2849009037017822
start getting new_q
get new_q:  0.07453274726867676
train start:
critic loss [0.024830928, 0.023540366]
critic loss [0.024658069, 0.023328042]
critic loss [0.024269603, 0.023009978]
critic loss [0.023732163, 0.022631418]
critic loss [0.023121795, 0.02223766]
actor loss -54.8403
train end 0.68778395652771
should be 1.0, predicted: [ 1.14965451], target predicted: [ 0.99170256]
start searching new action
get new action:  0.24892234802246094
start getting new_q
get new_q:  0.06233620643615723
train start:
critic loss [0.027373904, 0.02678591]
critic loss [0.027028978, 0.026528835]
critic loss [0.026618369, 0.026201595]
critic loss [0.026168499, 0.025818512]
critic 

critic loss [0.054693967, 0.036794484]
critic loss [0.052650359, 0.036576532]
critic loss [0.049419932, 0.035703477]
critic loss [0.045392662, 0.034282058]
actor loss -40.9124
train end 0.5488450527191162
should be 1.0, predicted: [ 0.91824645], target predicted: [ 0.87280053]
should be 1.0, predicted: [ 0.57872152], target predicted: [ 0.72928041]
should be 1.0, predicted: [ 0.73670614], target predicted: [ 0.50211918]
should be 1.0, predicted: [ 0.78202283], target predicted: [ 0.84976602]
start searching new action
get new action:  0.23005890846252441
start getting new_q
get new_q:  0.07047390937805176
train start:
critic loss [0.033110157, 0.033110157]
critic loss [0.032270484, 0.032270484]
critic loss [0.031210663, 0.031210663]
critic loss [0.030054722, 0.030054722]
critic loss [0.028919395, 0.028919395]
actor loss -29.1821
train end 0.5503246784210205
start searching new action
get new action:  0.23076486587524414
start getting new_q
get new_q:  0.06477880477905273
train start:
c

get new action:  0.2558553218841553
start getting new_q
get new_q:  0.07794952392578125
train start:
critic loss [0.026850024, 0.026819289]
critic loss [0.026612278, 0.026531037]
critic loss [0.026377238, 0.026224744]
critic loss [0.026140908, 0.025903275]
critic loss [0.025896719, 0.025568932]
actor loss -3.46692
train end 0.6908283233642578
should be -1.0, predicted: [-0.88898993], target predicted: [-0.9541232]
start searching new action
get new action:  0.2570204734802246
start getting new_q
get new_q:  0.08329939842224121
train start:
critic loss [0.021483373, 0.021483373]
critic loss [0.021561645, 0.021561645]
critic loss [0.021539357, 0.021539357]
critic loss [0.021413578, 0.021413578]
critic loss [0.021191072, 0.021191072]
actor loss -33.297
train end 0.6844830513000488
start searching new action
get new action:  0.24807095527648926
start getting new_q
get new_q:  0.06936097145080566
train start:
critic loss [0.023771871, 0.023771871]
critic loss [0.023515882, 0.023515882]
crit

critic loss [0.049816601, 0.037120692]
critic loss [0.048597854, 0.036781821]
critic loss [0.04687367, 0.036373872]
critic loss [0.044850819, 0.035923779]
actor loss -106.351
train end 0.601189374923706
should be 1.0, predicted: [ 0.84660006], target predicted: [ 0.73658025]
should be 1.0, predicted: [ 0.47449103], target predicted: [ 0.83392459]
should be -1.0, predicted: [-0.90883195], target predicted: [-1.05284166]
Episode : 618 Replay Buffer 50000
TOTAL REWARD @ 618-th Episode  : Reward 1
Total Step: 14
start searching new action
get new action:  0.23776698112487793
start getting new_q
get new_q:  0.06890463829040527
train start:
critic loss [0.099613652, 0.035007372]
critic loss [0.095160976, 0.034557603]
critic loss [0.086882159, 0.033764098]
critic loss [0.076185554, 0.032795124]
critic loss [0.065080181, 0.031850405]
actor loss -41.3524
train end 0.5280578136444092
should be 1.0, predicted: [ 0.46543348], target predicted: [ 0.67928851]
should be 1.0, predicted: [ 0.647282], t

start searching new action
get new action:  0.25470900535583496
start getting new_q
get new_q:  0.07782363891601562
train start:
critic loss [0.061505564, 0.035059229]
critic loss [0.059139624, 0.035579871]
critic loss [0.055910982, 0.035917982]
critic loss [0.05223329, 0.03610906]
critic loss [0.04851732, 0.036175158]
actor loss -33.0379
train end 0.614835262298584
should be 1.0, predicted: [ 0.55218917], target predicted: [-0.03887328]
should be 1.0, predicted: [ 0.96630412], target predicted: [ 0.94755274]
start searching new action
get new action:  0.2400062084197998
start getting new_q
get new_q:  0.06920886039733887
train start:
critic loss [0.034615275, 0.02108372]
critic loss [0.032601267, 0.021000497]
critic loss [0.029770892, 0.020803602]
critic loss [0.026779855, 0.020604648]
critic loss [0.024169175, 0.020482995]
actor loss -20.2279
train end 0.6077485084533691
should be 1.0, predicted: [ 0.77130109], target predicted: [ 0.6210317]
should be 1.0, predicted: [ 0.94093919], t

get new action:  0.2409060001373291
start getting new_q
get new_q:  0.08111310005187988
train start:
critic loss [0.036081884, 0.03156618]
critic loss [0.035636086, 0.031558741]
critic loss [0.034906879, 0.031369746]
critic loss [0.033972196, 0.03100281]
critic loss [0.032908432, 0.030466888]
actor loss -33.454
train end 0.7647888660430908
should be -1.0, predicted: [-0.46061525], target predicted: [-0.62510604]
should be 1.0, predicted: [ 1.17121947], target predicted: [ 1.16524994]
should be 1.0, predicted: [ 0.97381431], target predicted: [ 0.95684487]
start searching new action
get new action:  0.2759513854980469
start getting new_q
get new_q:  0.07626080513000488
train start:
critic loss [0.027953481, 0.02670778]
critic loss [0.027421508, 0.026255673]
critic loss [0.026793923, 0.025712499]
critic loss [0.026116505, 0.025112471]
critic loss [0.025418494, 0.024484191]
actor loss -58.8331
train end 0.7063601016998291
should be -1.0, predicted: [-1.06200361], target predicted: [-0.946

Episode : 623 Replay Buffer 50000
TOTAL REWARD @ 623-th Episode  : Reward 1
Total Step: 96
start searching new action
get new action:  0.24764704704284668
start getting new_q
get new_q:  0.07494330406188965
train start:
critic loss [0.032604069, 0.021121532]
critic loss [0.033244181, 0.021420751]
critic loss [0.033034787, 0.021519162]
critic loss [0.032025233, 0.021386035]
critic loss [0.030369928, 0.02104136]
actor loss -5.72083
train end 0.6233997344970703
should be 1.0, predicted: [ 0.56935751], target predicted: [ 0.90387577]
should be 1.0, predicted: [ 0.3199136], target predicted: [ 0.9326455]
start searching new action
get new action:  0.239227294921875
start getting new_q
get new_q:  0.08199310302734375
train start:
critic loss [0.1055114, 0.031508826]
critic loss [0.099141777, 0.030501282]
critic loss [0.089047775, 0.029077871]
critic loss [0.076915659, 0.027512446]
critic loss [0.064466111, 0.026062351]
actor loss -5.3683
train end 0.5700559616088867
should be -1.0, predicted

critic loss [0.041492544, 0.040919032]
actor loss -31.8436
train end 0.5539600849151611
should be 1.0, predicted: [ 0.83547097], target predicted: [ 0.84796238]
should be -1.0, predicted: [-0.69585937], target predicted: [-0.95183194]
start searching new action
get new action:  0.23916292190551758
start getting new_q
get new_q:  0.06608438491821289
train start:
critic loss [0.030913733, 0.030006057]
critic loss [0.030459421, 0.029641856]
critic loss [0.029966731, 0.029273689]
critic loss [0.029484473, 0.028931357]
critic loss [0.029031359, 0.02861882]
actor loss -16.9711
train end 0.5180330276489258
should be 1.0, predicted: [ 1.07096708], target predicted: [ 0.90824014]
should be -1.0, predicted: [-1.06245458], target predicted: [-0.95090038]
should be 1.0, predicted: [ 1.18094933], target predicted: [ 0.74883366]
Episode : 625 Replay Buffer 50000
TOTAL REWARD @ 625-th Episode  : Reward 1
Total Step: 167
start searching new action
get new action:  0.24631905555725098
start getting new

critic loss [0.04068049, 0.03810041]
critic loss [0.037810571, 0.036027327]
actor loss -71.671
train end 0.7051403522491455
should be 1.0, predicted: [ 1.26290345], target predicted: [ 1.07041192]
start searching new action
get new action:  0.23230290412902832
start getting new_q
get new_q:  0.06557297706604004
train start:
critic loss [0.040076356, 0.028539561]
critic loss [0.038843244, 0.02749677]
critic loss [0.037277214, 0.026516765]
critic loss [0.035524093, 0.025606103]
critic loss [0.033714987, 0.024769399]
actor loss -40.0019
train end 0.5362520217895508
should be 1.0, predicted: [ 0.51972103], target predicted: [ 0.6240201]
should be -1.0, predicted: [-0.95756948], target predicted: [-1.01397741]
should be -1.0, predicted: [-0.86963457], target predicted: [-0.62311786]
start searching new action
get new action:  0.2207491397857666
start getting new_q
get new_q:  0.06283283233642578
train start:
critic loss [0.034646653, 0.033636875]
critic loss [0.033878352, 0.032966692]
criti

start searching new action
get new action:  0.2573108673095703
start getting new_q
get new_q:  0.07308650016784668
train start:
critic loss [0.035548232, 0.034399502]
critic loss [0.035113659, 0.033875175]
critic loss [0.034480166, 0.033224314]
critic loss [0.033698604, 0.032479901]
critic loss [0.032832298, 0.031683896]
actor loss -53.4141
train end 0.6425039768218994
should be 1.0, predicted: [ 0.98500788], target predicted: [ 0.85661054]
should be 1.0, predicted: [ 0.84667212], target predicted: [ 0.94955504]
start searching new action
get new action:  0.23124051094055176
start getting new_q
get new_q:  0.06248068809509277
train start:
critic loss [0.12361696, 0.046534508]
critic loss [0.11571951, 0.045641482]
critic loss [0.1030122, 0.043942988]
critic loss [0.088025622, 0.041772801]
critic loss [0.073138818, 0.039465703]
actor loss -66.9981
train end 0.5557670593261719
should be 1.0, predicted: [ 0.6960144], target predicted: [ 0.65151781]
should be 1.0, predicted: [ 0.87712663], 

get new action:  0.24466419219970703
start getting new_q
get new_q:  0.06663918495178223
train start:
critic loss [0.022230741, 0.022230741]
critic loss [0.022301132, 0.022301132]
critic loss [0.022221383, 0.022221383]
critic loss [0.022005063, 0.022005063]
critic loss [0.021675797, 0.021675797]
actor loss -70.9417
train end 0.7334308624267578
start searching new action
get new action:  0.2621147632598877
start getting new_q
get new_q:  0.07259702682495117
train start:
critic loss [0.028101211, 0.024985746]
critic loss [0.027820425, 0.024898063]
critic loss [0.027401332, 0.024756812]
critic loss [0.026886195, 0.024567321]
critic loss [0.026312433, 0.024334181]
actor loss -64.4723
train end 1.0407214164733887
should be -1.0, predicted: [-1.05906296], target predicted: [-0.99550956]
should be -1.0, predicted: [-0.81724435], target predicted: [-0.38260117]
start searching new action
get new action:  0.2965390682220459
start getting new_q
get new_q:  0.0663449764251709
train start:
critic 

critic loss [0.028398478, 0.024275707]
critic loss [0.027542282, 0.023794569]
critic loss [0.026209939, 0.023061417]
critic loss [0.024601592, 0.022155559]
actor loss 42.1037
train end 0.5107204914093018
should be 1.0, predicted: [ 1.03656757], target predicted: [ 0.83374172]
should be 1.0, predicted: [ 0.95839977], target predicted: [ 0.86918789]
should be 1.0, predicted: [ 1.01216769], target predicted: [ 0.83087415]
should be 1.0, predicted: [ 0.87342775], target predicted: [ 0.75560182]
start searching new action
get new action:  0.2281801700592041
start getting new_q
get new_q:  0.08144569396972656
train start:
critic loss [0.044620566, 0.032368388]
critic loss [0.043351665, 0.031908713]
critic loss [0.040915526, 0.03113444]
critic loss [0.03763856, 0.030128287]
critic loss [0.034097284, 0.029017072]
actor loss -41.3826
train end 0.5221025943756104
should be 1.0, predicted: [ 1.00336075], target predicted: [ 0.93152845]
should be 1.0, predicted: [ 0.68185055], target predicted: [ 

get new action:  0.23099231719970703
start getting new_q
get new_q:  0.0618133544921875
train start:
critic loss [0.035086811, 0.026474696]
critic loss [0.034852006, 0.025956996]
critic loss [0.034020178, 0.025461107]
critic loss [0.032652743, 0.024961617]
critic loss [0.030916417, 0.024455458]
actor loss -56.647
train end 0.5593693256378174
should be 1.0, predicted: [ 0.80610895], target predicted: [ 0.83485472]
should be 1.0, predicted: [ 0.77213609], target predicted: [ 0.48551503]
Episode : 634 Replay Buffer 50000
TOTAL REWARD @ 634-th Episode  : Reward -1
Total Step: 128
start searching new action
get new action:  0.23342442512512207
start getting new_q
get new_q:  0.0685420036315918
train start:
critic loss [0.032854274, 0.023486257]
critic loss [0.032024838, 0.023224758]
critic loss [0.030482136, 0.022798741]
critic loss [0.028490331, 0.022261567]
critic loss [0.026319852, 0.021671327]
actor loss -13.5837
train end 0.5047273635864258
should be 1.0, predicted: [ 0.86848181], targ

critic loss [0.087052047, 0.025655214]
critic loss [0.07794071, 0.024042407]
critic loss [0.066716485, 0.022306675]
critic loss [0.05509036, 0.020728992]
actor loss -31.9084
train end 0.6749329566955566
should be 1.0, predicted: [ 0.98431534], target predicted: [ 1.08962309]
should be -1.0, predicted: [-0.37163576], target predicted: [-0.07736032]
start searching new action
get new action:  0.23734140396118164
start getting new_q
get new_q:  0.06528401374816895
train start:
critic loss [0.033147804, 0.031505588]
critic loss [0.033842266, 0.031440482]
critic loss [0.034412913, 0.031314239]
critic loss [0.034670956, 0.031010842]
critic loss [0.034532025, 0.030490506]
actor loss -16.4489
train end 0.5825634002685547
should be 1.0, predicted: [ 0.56675512], target predicted: [ 0.78900647]
Episode : 636 Replay Buffer 50000
TOTAL REWARD @ 636-th Episode  : Reward -1
Total Step: 145
start searching new action
get new action:  0.23913073539733887
start getting new_q
get new_q:  0.0730092525482

get new action:  0.24259424209594727
start getting new_q
get new_q:  0.06623601913452148
train start:
critic loss [0.075797729, 0.028272396]
critic loss [0.069984242, 0.027818885]
critic loss [0.062287733, 0.027259607]
critic loss [0.053343527, 0.026672214]
critic loss [0.044372723, 0.026181109]
actor loss -47.7233
train end 0.5435309410095215
should be -1.0, predicted: [-0.97002393], target predicted: [-0.99655473]
should be 1.0, predicted: [ 0.56701499], target predicted: [-0.06178339]
should be 1.0, predicted: [ 0.81290287], target predicted: [ 0.5766536]
start searching new action
get new action:  0.23246049880981445
start getting new_q
get new_q:  0.06888461112976074
train start:
critic loss [0.032506868, 0.028258128]
critic loss [0.0343678, 0.02915174]
critic loss [0.035082102, 0.029432924]
critic loss [0.034634702, 0.029104691]
critic loss [0.033169881, 0.028243717]
actor loss -80.7132
train end 0.5333764553070068
should be 1.0, predicted: [ 1.11768544], target predicted: [ 0.90

actor loss -27.0515
train end 0.9235942363739014
should be -1.0, predicted: [-0.45280921], target predicted: [-0.49700278]
start searching new action
get new action:  0.225358247756958
start getting new_q
get new_q:  0.07262659072875977
train start:
critic loss [0.05574999, 0.038905483]
critic loss [0.055277526, 0.038596377]
critic loss [0.054072097, 0.038152914]
critic loss [0.052268118, 0.037589595]
critic loss [0.050026119, 0.036938518]
actor loss -70.8382
train end 0.6419334411621094
should be 1.0, predicted: [ 0.55559546], target predicted: [ 0.31954995]
should be 1.0, predicted: [ 0.95674086], target predicted: [ 0.90109891]
start searching new action
get new action:  0.23451662063598633
start getting new_q
get new_q:  0.0639195442199707
train start:
critic loss [0.028009718, 0.026934296]
critic loss [0.027696846, 0.026624154]
critic loss [0.027297668, 0.026247041]
critic loss [0.026834007, 0.025825148]
critic loss [0.026318317, 0.025369229]
actor loss 53.4793
train end 0.5878708

critic loss [0.027622735, 0.026319481]
critic loss [0.027427837, 0.026066046]
critic loss [0.026975157, 0.025682786]
critic loss [0.026294116, 0.025166344]
actor loss -91.1181
train end 0.539341926574707
should be -1.0, predicted: [-1.36043191], target predicted: [-1.07238519]
start searching new action
get new action:  0.22669506072998047
start getting new_q
get new_q:  0.06394720077514648
train start:
critic loss [0.034027845, 0.034027845]
critic loss [0.033998378, 0.033998378]
critic loss [0.033835474, 0.033835474]
critic loss [0.033549178, 0.033549178]
critic loss [0.033159278, 0.033159278]
actor loss -107.389
train end 0.5141499042510986
Episode : 641 Replay Buffer 50000
TOTAL REWARD @ 641-th Episode  : Reward -1
Total Step: 19
start searching new action
get new action:  0.24260282516479492
start getting new_q
get new_q:  0.06737780570983887
train start:
critic loss [0.03095099, 0.026454374]
critic loss [0.030719105, 0.026175257]
critic loss [0.030138303, 0.025796711]
critic loss 

get new action:  0.23367857933044434
start getting new_q
get new_q:  0.06563591957092285
train start:
critic loss [0.030425118, 0.030332506]
critic loss [0.029809875, 0.029757636]
critic loss [0.029060423, 0.029036896]
critic loss [0.028233316, 0.028226793]
critic loss [0.027387513, 0.027387341]
actor loss 47.6345
train end 0.5555527210235596
should be 1.0, predicted: [ 1.19305217], target predicted: [ 1.02435493]
start searching new action
get new action:  0.2577390670776367
start getting new_q
get new_q:  0.08737754821777344
train start:
critic loss [0.034409344, 0.03062466]
critic loss [0.034057692, 0.030413805]
critic loss [0.033634521, 0.030193746]
critic loss [0.03314393, 0.029953748]
critic loss [0.032601357, 0.029695043]
actor loss -114.321
train end 0.7433671951293945
should be 1.0, predicted: [ 0.75847971], target predicted: [ 0.66504115]
Episode : 643 Replay Buffer 50000
TOTAL REWARD @ 643-th Episode  : Reward -1
Total Step: 131
start searching new action
get new action:  0.

critic loss [0.025149276, 0.021205554]
critic loss [0.024830345, 0.021091916]
critic loss [0.024293277, 0.020884447]
critic loss [0.023590496, 0.020601008]
actor loss -131.33
train end 0.5358419418334961
should be 1.0, predicted: [ 0.9056288], target predicted: [ 0.89961833]
should be -1.0, predicted: [-0.75107086], target predicted: [-0.73932791]
start searching new action
get new action:  0.23903369903564453
start getting new_q
get new_q:  0.07733273506164551
train start:
critic loss [0.048404589, 0.022296092]
critic loss [0.046970762, 0.022137793]
critic loss [0.044331118, 0.021861594]
critic loss [0.040898316, 0.021523487]
critic loss [0.037149012, 0.021186467]
actor loss -25.9909
train end 0.6294004917144775
should be 1.0, predicted: [ 0.48969078], target predicted: [ 0.20439512]
should be 1.0, predicted: [ 1.02981603], target predicted: [ 0.97063172]
Episode : 645 Replay Buffer 50000
TOTAL REWARD @ 645-th Episode  : Reward 1
Total Step: 123
start searching new action
get new acti

get new action:  0.24887871742248535
start getting new_q
get new_q:  0.0632314682006836
train start:
critic loss [0.031545117, 0.022793565]
critic loss [0.0311414, 0.022639504]
critic loss [0.030062024, 0.022343058]
critic loss [0.028477874, 0.021917045]
critic loss [0.026597995, 0.021389313]
actor loss -153.25
train end 0.4995877742767334
should be -1.0, predicted: [-0.96783549], target predicted: [-0.82726252]
should be 1.0, predicted: [ 1.16842055], target predicted: [ 1.03057277]
start searching new action
get new action:  0.23358941078186035
start getting new_q
get new_q:  0.06572413444519043
train start:
critic loss [0.022822496, 0.02050435]
critic loss [0.022701614, 0.020425776]
critic loss [0.022388572, 0.020259466]
critic loss [0.021900252, 0.020013914]
critic loss [0.021276191, 0.019702874]
actor loss -15.7171
train end 0.5095138549804688
should be 1.0, predicted: [ 0.84947312], target predicted: [ 0.93262744]
should be 1.0, predicted: [ 1.00424564], target predicted: [ 1.081

critic loss [0.030959025, 0.030959025]
actor loss -46.6452
train end 0.5286211967468262
start searching new action
get new action:  0.23771405220031738
start getting new_q
get new_q:  0.06033468246459961
train start:
critic loss [0.02933613, 0.023647552]
critic loss [0.027805941, 0.02312969]
critic loss [0.025823459, 0.022396119]
critic loss [0.023754811, 0.021569714]
critic loss [0.021903975, 0.020753432]
actor loss -80.949
train end 0.5321886539459229
should be -1.0, predicted: [-1.03308022], target predicted: [-1.02416348]
start searching new action
get new action:  0.23114705085754395
start getting new_q
get new_q:  0.06787729263305664
train start:
critic loss [0.03642929, 0.025000589]
critic loss [0.036537401, 0.025057154]
critic loss [0.035530098, 0.024857521]
critic loss [0.033636913, 0.024418421]
critic loss [0.03118021, 0.023793206]
actor loss -58.12
train end 0.5220775604248047
should be 1.0, predicted: [ 0.70377839], target predicted: [ 0.7357505]
should be 1.0, predicted: [

Episode : 650 Replay Buffer 50000
TOTAL REWARD @ 650-th Episode  : Reward 1
Total Step: 103
start searching new action
get new action:  0.2559971809387207
start getting new_q
get new_q:  0.08031558990478516
train start:
critic loss [0.037614182, 0.031268023]
critic loss [0.037356131, 0.031050757]
critic loss [0.036282301, 0.03039325]
critic loss [0.034658484, 0.029460358]
critic loss [0.032788347, 0.028383516]
actor loss -114.077
train end 0.6450581550598145
should be 1.0, predicted: [ 1.12866008], target predicted: [ 1.01096237]
should be 1.0, predicted: [ 1.28811514], target predicted: [ 1.00979316]
should be 1.0, predicted: [ 0.85864973], target predicted: [ 0.86711925]
should be 1.0, predicted: [ 0.93834686], target predicted: [ 0.57689363]
start searching new action
get new action:  0.24761009216308594
start getting new_q
get new_q:  0.07474899291992188
train start:
critic loss [0.028017912, 0.026906604]
critic loss [0.026946791, 0.026156481]
critic loss [0.02586939, 0.02538478]
c

start searching new action
get new action:  0.2203664779663086
start getting new_q
get new_q:  0.06431865692138672
train start:
critic loss [0.06700708, 0.040874939]
critic loss [0.064063109, 0.03965028]
critic loss [0.059833959, 0.038159948]
critic loss [0.054905962, 0.03657959]
critic loss [0.049824607, 0.035046875]
actor loss -173.519
train end 0.5797197818756104
should be 1.0, predicted: [ 0.41107839], target predicted: [ 0.05055469]
should be -1.0, predicted: [-0.90891337], target predicted: [-0.70906436]
Episode : 652 Replay Buffer 50000
TOTAL REWARD @ 652-th Episode  : Reward 1
Total Step: 152
start searching new action
get new action:  0.2427976131439209
start getting new_q
get new_q:  0.06510376930236816
train start:
critic loss [0.041209925, 0.019624867]
critic loss [0.039797373, 0.020028299]
critic loss [0.036695365, 0.020065797]
critic loss [0.032675438, 0.019794462]
critic loss [0.028443906, 0.019298483]
actor loss -25.0565
train end 0.5812087059020996
should be 1.0, predi

get new action:  0.23613715171813965
start getting new_q
get new_q:  0.06420016288757324
train start:
critic loss [0.017527796, 0.016016116]
critic loss [0.017281685, 0.015741462]
critic loss [0.017219968, 0.01559633]
critic loss [0.01719144, 0.015509645]
critic loss [0.017083488, 0.015426615]
actor loss 38.6636
train end 0.5351436138153076
should be 1.0, predicted: [ 0.92036432], target predicted: [ 0.92007452]
should be -1.0, predicted: [-0.99252629], target predicted: [-1.10694396]
should be 1.0, predicted: [ 0.98983753], target predicted: [ 0.97440654]
should be -1.0, predicted: [-0.6935904], target predicted: [-0.70365673]
start searching new action
get new action:  0.3122391700744629
start getting new_q
get new_q:  0.06773567199707031
train start:
critic loss [0.020328559, 0.018048247]
critic loss [0.020387374, 0.018281976]
critic loss [0.020180758, 0.018293211]
critic loss [0.019731719, 0.018086337]
critic loss [0.019085106, 0.017689493]
actor loss -88.3292
train end 0.496768236

critic loss [0.028479706, 0.021498043]
actor loss 57.7559
train end 0.6666104793548584
should be 1.0, predicted: [ 0.58537614], target predicted: [ 0.49285653]
should be -1.0, predicted: [-0.87563258], target predicted: [-1.0575484]
start searching new action
get new action:  0.26067304611206055
start getting new_q
get new_q:  0.07984590530395508
train start:
critic loss [0.032542571, 0.032077819]
critic loss [0.033804759, 0.033105277]
critic loss [0.034639012, 0.033766326]
critic loss [0.034972765, 0.034019958]
critic loss [0.034804352, 0.033872575]
actor loss -325.231
train end 0.6154861450195312
should be -1.0, predicted: [-0.93273354], target predicted: [-1.0835005]
start searching new action
get new action:  0.23143935203552246
start getting new_q
get new_q:  0.06179046630859375
train start:
critic loss [0.032533981, 0.032145485]
critic loss [0.032107554, 0.031753525]
critic loss [0.031519432, 0.031222124]
critic loss [0.030835047, 0.03060575]
critic loss [0.030109908, 0.029950099

critic loss [0.023880024, 0.01995334]
critic loss [0.023317747, 0.020007208]
critic loss [0.022551775, 0.020030145]
critic loss [0.021726526, 0.02002025]
actor loss -291.11
train end 0.6233420372009277
should be 1.0, predicted: [ 1.03802311], target predicted: [ 1.09713709]
should be 1.0, predicted: [ 0.78862888], target predicted: [ 0.70355362]
start searching new action
get new action:  0.251065731048584
start getting new_q
get new_q:  0.06333804130554199
train start:
critic loss [0.056701295, 0.025143558]
critic loss [0.053687498, 0.025090009]
critic loss [0.049210362, 0.024988007]
critic loss [0.044080812, 0.024913544]
critic loss [0.039100289, 0.024924407]
actor loss -178.551
train end 0.5407321453094482
should be 1.0, predicted: [ 0.77041936], target predicted: [ 0.44953936]
should be 1.0, predicted: [ 0.58368528], target predicted: [-0.10371295]
should be 1.0, predicted: [ 1.03797996], target predicted: [ 0.00859618]
should be 1.0, predicted: [ 0.915259], target predicted: [ 0.8

get new action:  0.2270677089691162
start getting new_q
get new_q:  0.06750297546386719
train start:
critic loss [0.065903701, 0.025473803]
critic loss [0.059140701, 0.024480756]
critic loss [0.050614871, 0.023424599]
critic loss [0.04221978, 0.022520863]
critic loss [0.035440758, 0.021952532]
actor loss 1.22491
train end 0.551213264465332
should be 1.0, predicted: [ 0.64236426], target predicted: [ 0.64109033]
should be 1.0, predicted: [ 0.57196856], target predicted: [ 0.43261442]
should be 1.0, predicted: [ 1.07363784], target predicted: [ 1.19236112]
start searching new action
get new action:  0.22489666938781738
start getting new_q
get new_q:  0.08395648002624512
train start:
critic loss [0.038687419, 0.03779396]
critic loss [0.039099552, 0.038052499]
critic loss [0.039190445, 0.03809179]
critic loss [0.038935684, 0.037885658]
critic loss [0.038358837, 0.037441608]
actor loss -73.6469
train end 0.692133903503418
should be 1.0, predicted: [ 1.04904139], target predicted: [ 1.056880

critic loss [0.019260738, 0.018401414]
actor loss 32.4411
train end 0.5195639133453369
should be 1.0, predicted: [ 0.81016183], target predicted: [ 0.98413706]
start searching new action
get new action:  0.2299973964691162
start getting new_q
get new_q:  0.06670308113098145
train start:
critic loss [0.031151272, 0.029649965]
critic loss [0.031146515, 0.029871441]
critic loss [0.031013494, 0.029959694]
critic loss [0.030751182, 0.029907756]
critic loss [0.030363321, 0.029713297]
actor loss -133.679
train end 0.5494968891143799
should be -1.0, predicted: [-0.90592408], target predicted: [-0.94751215]
should be 1.0, predicted: [ 0.83192551], target predicted: [ 0.90226799]
start searching new action
get new action:  0.2521543502807617
start getting new_q
get new_q:  0.07563638687133789
train start:
critic loss [0.029888196, 0.027372515]
critic loss [0.029296145, 0.026969012]
critic loss [0.028373029, 0.026351128]
critic loss [0.027237087, 0.025593553]
critic loss [0.026012214, 0.024771923

critic loss [0.036096118, 0.031549275]
actor loss -187.527
train end 0.5876126289367676
should be -1.0, predicted: [-0.71238756], target predicted: [-0.7138474]
start searching new action
get new action:  0.2399592399597168
start getting new_q
get new_q:  0.06667566299438477
train start:
critic loss [0.042046003, 0.031150388]
critic loss [0.041513301, 0.031059869]
critic loss [0.040622748, 0.030787462]
critic loss [0.039458148, 0.030366223]
critic loss [0.038106658, 0.029829785]
actor loss -123.688
train end 0.5497539043426514
should be 1.0, predicted: [ 0.49754581], target predicted: [ 0.30837244]
start searching new action
get new action:  0.2472236156463623
start getting new_q
get new_q:  0.06728196144104004
train start:
critic loss [0.024811212, 0.021225272]
critic loss [0.023741225, 0.020959765]
critic loss [0.022338215, 0.020535719]
critic loss [0.020970961, 0.020012585]
critic loss [0.019857498, 0.019425578]
actor loss -81.9881
train end 0.5875213146209717
should be 1.0, predict

critic loss [0.017431378, 0.017431378]
critic loss [0.017626975, 0.017626975]
critic loss [0.017625464, 0.017625464]
critic loss [0.017429518, 0.017429518]
actor loss -311.665
train end 0.6716394424438477
start searching new action
get new action:  0.2410109043121338
start getting new_q
get new_q:  0.06976008415222168
train start:
critic loss [0.087670058, 0.032634892]
critic loss [0.083393544, 0.031881206]
critic loss [0.076413102, 0.03080322]
critic loss [0.067921758, 0.029600078]
critic loss [0.058478948, 0.028396279]
actor loss -208.595
train end 0.5764639377593994
should be 1.0, predicted: [ 0.06731166], target predicted: [ 0.61669856]
should be -1.0, predicted: [-1.01834559], target predicted: [-0.72935748]
should be 1.0, predicted: [ 0.10205771], target predicted: [ 0.91993564]
start searching new action
get new action:  0.23712849617004395
start getting new_q
get new_q:  0.06705617904663086
train start:
critic loss [0.021767939, 0.019003216]
critic loss [0.022242138, 0.01936755

critic loss [0.021820609, 0.021546792]
critic loss [0.021603215, 0.021378461]
critic loss [0.021315763, 0.021148756]
critic loss [0.020978026, 0.020868093]
critic loss [0.020606957, 0.020545904]
actor loss -226.249
train end 1.0978105068206787
should be -1.0, predicted: [-1.01287544], target predicted: [-1.0357188]
start searching new action
get new action:  0.29270434379577637
start getting new_q
get new_q:  0.07927560806274414
train start:
critic loss [0.036373224, 0.032840159]
critic loss [0.035803158, 0.03246199]
critic loss [0.034927703, 0.031983741]
critic loss [0.03386651, 0.031449843]
critic loss [0.032740533, 0.030900065]
actor loss -150.011
train end 0.6286232471466064
should be 1.0, predicted: [ 0.88721663], target predicted: [ 0.78307772]
should be 1.0, predicted: [ 0.92169839], target predicted: [ 0.88647503]
start searching new action
get new action:  0.24625205993652344
start getting new_q
get new_q:  0.07123970985412598
train start:
critic loss [0.03621475, 0.024383835]

critic loss [0.03293018, 0.019256946]
actor loss -118.893
train end 0.5355591773986816
should be -1.0, predicted: [-1.22396743], target predicted: [-1.233724]
should be 1.0, predicted: [ 0.78154218], target predicted: [ 0.40197751]
should be -1.0, predicted: [-0.41578633], target predicted: [-0.79431909]
should be -1.0, predicted: [-0.90045905], target predicted: [-0.88119715]
should be 1.0, predicted: [ 1.08178854], target predicted: [ 0.84913462]
Episode : 669 Replay Buffer 50000
TOTAL REWARD @ 669-th Episode  : Reward 1
Total Step: 120
start searching new action
get new action:  0.24762940406799316
start getting new_q
get new_q:  0.08150959014892578
train start:
critic loss [0.08360821, 0.028275732]
critic loss [0.078082234, 0.0279947]
critic loss [0.066582605, 0.026545905]
critic loss [0.052708037, 0.024437262]
critic loss [0.039526165, 0.022189284]
actor loss -142.679
train end 0.7243711948394775
should be -1.0, predicted: [-0.59969193], target predicted: [-0.77115136]
should be -

get new action:  0.23234820365905762
start getting new_q
get new_q:  0.06977033615112305
train start:
critic loss [0.11746579, 0.0409033]
critic loss [0.10904716, 0.040331118]
critic loss [0.096912898, 0.039511051]
critic loss [0.083429739, 0.03874607]
critic loss [0.070552915, 0.038257066]
actor loss 21.9999
train end 0.5225567817687988
should be 1.0, predicted: [ 1.08603907], target predicted: [ 1.13109779]
should be 1.0, predicted: [ 1.0047617], target predicted: [ 0.78328091]
should be 1.0, predicted: [ 1.14579177], target predicted: [ 0.83670652]
should be 1.0, predicted: [ 0.77087396], target predicted: [-0.33272594]
should be 1.0, predicted: [ 1.0549289], target predicted: [ 1.11078918]
should be 1.0, predicted: [ 0.97059333], target predicted: [ 0.72084665]
start searching new action
get new action:  0.23385977745056152
start getting new_q
get new_q:  0.07036638259887695
train start:
critic loss [0.020387972, 0.019709613]
critic loss [0.020877924, 0.02046529]
critic loss [0.021

get new action:  0.22963237762451172
start getting new_q
get new_q:  0.06543326377868652
train start:
critic loss [0.069658771, 0.028310698]
critic loss [0.065036759, 0.027854025]
critic loss [0.059380196, 0.027404509]
critic loss [0.053674497, 0.027105065]
critic loss [0.048163388, 0.026962217]
actor loss -201.091
train end 0.5739326477050781
should be 1.0, predicted: [ 1.04672801], target predicted: [ 1.06813669]
should be -1.0, predicted: [-0.31605652], target predicted: [-0.40990609]
should be -1.0, predicted: [-0.57906568], target predicted: [-0.39177325]
start searching new action
get new action:  0.23796653747558594
start getting new_q
get new_q:  0.06878352165222168
train start:
critic loss [0.022143176, 0.019631002]
critic loss [0.02242841, 0.019813253]
critic loss [0.022175148, 0.019747268]
critic loss [0.02145713, 0.019437566]
critic loss [0.020404479, 0.018920537]
actor loss -203.707
train end 0.5491101741790771
should be -1.0, predicted: [-1.03808331], target predicted: [-

Episode : 674 Replay Buffer 50000
TOTAL REWARD @ 674-th Episode  : Reward -1
Total Step: 9
start searching new action
get new action:  0.24764466285705566
start getting new_q
get new_q:  0.06893801689147949
train start:
critic loss [0.033652369, 0.033652369]
critic loss [0.033390466, 0.033390466]
critic loss [0.032877736, 0.032877736]
critic loss [0.032170139, 0.032170139]
critic loss [0.031327285, 0.031327285]
actor loss -398.825
train end 0.6789746284484863
start searching new action
get new action:  0.262007474899292
start getting new_q
get new_q:  0.09052658081054688
train start:
critic loss [0.092537038, 0.039224513]
critic loss [0.088944532, 0.037953131]
critic loss [0.080583081, 0.035780787]
critic loss [0.069123164, 0.033077173]
critic loss [0.056468356, 0.030226458]
actor loss 43.4606
train end 0.7846043109893799
should be 1.0, predicted: [ 0.4052448], target predicted: [ 0.12782587]
should be 1.0, predicted: [ 0.85339355], target predicted: [ 0.91169333]
should be 1.0, predic

get new action:  0.22878241539001465
start getting new_q
get new_q:  0.06644582748413086
train start:
critic loss [0.021312844, 0.019740548]
critic loss [0.021248762, 0.019728627]
critic loss [0.02103149, 0.019657381]
critic loss [0.020690296, 0.0195287]
critic loss [0.020264219, 0.019348357]
actor loss -254.619
train end 0.5151681900024414
should be 1.0, predicted: [ 0.98407763], target predicted: [ 0.73655874]
should be -1.0, predicted: [-1.10948431], target predicted: [-0.72587979]
should be -1.0, predicted: [-0.95570248], target predicted: [-0.95244139]
start searching new action
get new action:  0.23383378982543945
start getting new_q
get new_q:  0.06638169288635254
train start:
critic loss [0.13910483, 0.030420475]
critic loss [0.13152964, 0.029514801]
critic loss [0.11906576, 0.028104708]
critic loss [0.10337819, 0.026433486]
critic loss [0.087010808, 0.024808701]
actor loss -64.4172
train end 0.5327675342559814
should be 1.0, predicted: [ 1.0802623], target predicted: [ 0.94485

critic loss [0.024642628, 0.019461399]
actor loss -325.003
train end 0.5244548320770264
should be 1.0, predicted: [ 0.43683395], target predicted: [ 0.14914404]
Episode : 678 Replay Buffer 50000
TOTAL REWARD @ 678-th Episode  : Reward 1
Total Step: 148
start searching new action
get new action:  0.2976036071777344
start getting new_q
get new_q:  0.07075190544128418
train start:
critic loss [0.017164934, 0.017164934]
critic loss [0.017037109, 0.017037109]
critic loss [0.01680674, 0.01680674]
critic loss [0.016501352, 0.016501352]
critic loss [0.016150312, 0.016150312]
actor loss -245.41
train end 1.1744909286499023
start searching new action
get new action:  0.25491762161254883
start getting new_q
get new_q:  0.0745687484741211
train start:
critic loss [0.028116304, 0.028116304]
critic loss [0.027935799, 0.027935799]
critic loss [0.027689239, 0.027689239]
critic loss [0.027371816, 0.027371816]
critic loss [0.026985552, 0.026985552]
actor loss -25.4827
train end 0.7260186672210693
start 

Episode : 680 Replay Buffer 50000
TOTAL REWARD @ 680-th Episode  : Reward -1
Total Step: 147
start searching new action
get new action:  0.25392627716064453
start getting new_q
get new_q:  0.0759439468383789
train start:
critic loss [0.034460232, 0.023796951]
critic loss [0.032093901, 0.023087019]
critic loss [0.029464483, 0.022404205]
critic loss [0.026968241, 0.021821341]
critic loss [0.024907893, 0.02138334]
actor loss -215.323
train end 0.5952901840209961
should be 1.0, predicted: [ 0.9774127], target predicted: [ 0.75883406]
should be 1.0, predicted: [ 0.48157173], target predicted: [ 0.36925212]
should be 1.0, predicted: [ 0.71966386], target predicted: [ 0.82255608]
start searching new action
get new action:  0.24051523208618164
start getting new_q
get new_q:  0.06857419013977051
train start:
critic loss [0.02146434, 0.020218004]
critic loss [0.021705005, 0.020514306]
critic loss [0.021893211, 0.020717949]
critic loss [0.021966463, 0.020798897]
critic loss [0.021900935, 0.020753

get new action:  0.25241708755493164
start getting new_q
get new_q:  0.08231401443481445
train start:
critic loss [0.023245312, 0.021815311]
critic loss [0.021976754, 0.020556323]
critic loss [0.020939734, 0.019467453]
critic loss [0.020176239, 0.018587429]
critic loss [0.019680336, 0.017925523]
actor loss -177.071
train end 0.9747169017791748
should be 1.0, predicted: [ 0.98436958], target predicted: [ 0.90774757]
should be 1.0, predicted: [ 0.80695367], target predicted: [ 0.89656454]
should be 1.0, predicted: [ 0.8555966], target predicted: [ 1.03181124]
Episode : 682 Replay Buffer 50000
TOTAL REWARD @ 682-th Episode  : Reward -1
Total Step: 131
start searching new action
get new action:  0.2619962692260742
start getting new_q
get new_q:  0.08034253120422363
train start:
critic loss [0.035457447, 0.035457447]
critic loss [0.03462838, 0.03462838]
critic loss [0.033855677, 0.033855677]
critic loss [0.033122811, 0.033122811]
critic loss [0.032402232, 0.032402232]
actor loss -334.924
tr

critic loss [0.019822948, 0.019821618]
actor loss 30.2424
train end 0.7373006343841553
should be 1.0, predicted: [ 1.08229744], target predicted: [ 1.03907239]
start searching new action
get new action:  0.23580431938171387
start getting new_q
get new_q:  0.06606101989746094
train start:
critic loss [0.029973924, 0.025551204]
critic loss [0.029556112, 0.025254413]
critic loss [0.028836546, 0.024898391]
critic loss [0.027934046, 0.02452109]
critic loss [0.026966715, 0.02415077]
actor loss -30.2899
train end 0.6076722145080566
should be 1.0, predicted: [ 0.99881679], target predicted: [ 0.86081266]
should be 1.0, predicted: [ 0.80973774], target predicted: [ 0.63455403]
Episode : 684 Replay Buffer 50000
TOTAL REWARD @ 684-th Episode  : Reward -1
Total Step: 119
start searching new action
get new action:  0.2422316074371338
start getting new_q
get new_q:  0.06835746765136719
train start:
critic loss [0.14909202, 0.038167875]
critic loss [0.14357865, 0.037525855]
critic loss [0.1331947, 0.

critic loss [0.025062315, 0.020624351]
actor loss -225.87
train end 0.5049772262573242
should be -1.0, predicted: [-0.93436646], target predicted: [-1.007617]
should be 1.0, predicted: [ 0.86383349], target predicted: [ 0.90795279]
should be 1.0, predicted: [ 0.93244278], target predicted: [ 0.91289961]
should be -1.0, predicted: [-0.61863327], target predicted: [-0.34922928]
start searching new action
get new action:  0.2395458221435547
start getting new_q
get new_q:  0.06695079803466797
train start:
critic loss [0.031182563, 0.024690943]
critic loss [0.029879428, 0.024200866]
critic loss [0.028212765, 0.023564491]
critic loss [0.026383037, 0.022848651]
critic loss [0.024612252, 0.022116706]
actor loss -299.176
train end 0.5358562469482422
should be 1.0, predicted: [ 0.87675214], target predicted: [ 1.01591802]
should be -1.0, predicted: [-0.73048896], target predicted: [-0.43974045]
start searching new action
get new action:  0.23569297790527344
start getting new_q
get new_q:  0.0661

get new action:  0.23108935356140137
start getting new_q
get new_q:  0.06412839889526367
train start:
critic loss [0.021912144, 0.021904878]
critic loss [0.021269448, 0.021217844]
critic loss [0.020667613, 0.020532228]
critic loss [0.020131579, 0.019878404]
critic loss [0.019662712, 0.01926754]
actor loss 11.1415
train end 0.5293035507202148
should be 1.0, predicted: [ 0.83449471], target predicted: [ 0.79446]
start searching new action
get new action:  0.23668122291564941
start getting new_q
get new_q:  0.06221795082092285
train start:
critic loss [0.026714966, 0.021823719]
critic loss [0.025983227, 0.0213487]
critic loss [0.025072968, 0.020878371]
critic loss [0.024063474, 0.020433901]
critic loss [0.023040943, 0.020027038]
actor loss -171.736
train end 0.5282919406890869
should be 1.0, predicted: [ 0.82306308], target predicted: [ 0.78935003]
start searching new action
get new action:  0.23155951499938965
start getting new_q
get new_q:  0.06477189064025879
train start:
critic loss [

get new action:  0.2370450496673584
start getting new_q
get new_q:  0.06269264221191406
train start:
critic loss [0.03323682, 0.019088108]
critic loss [0.031921279, 0.019153066]
critic loss [0.029114502, 0.018831596]
critic loss [0.025682138, 0.018270792]
critic loss [0.022261266, 0.01761868]
actor loss -58.2217
train end 0.5253036022186279
should be -1.0, predicted: [-0.81051975], target predicted: [-0.77373904]
should be 1.0, predicted: [ 0.80715662], target predicted: [ 0.81283039]
start searching new action
get new action:  0.23930025100708008
start getting new_q
get new_q:  0.06271243095397949
train start:
critic loss [0.029702382, 0.025300644]
critic loss [0.029698756, 0.0253966]
critic loss [0.02923977, 0.025331961]
critic loss [0.028422309, 0.025115717]
critic loss [0.02735962, 0.024767604]
actor loss -258.142
train end 0.7005715370178223
should be 1.0, predicted: [ 1.34776044], target predicted: [ 1.13019729]
start searching new action
get new action:  0.29944634437561035
star

get new action:  0.23412156105041504
start getting new_q
get new_q:  0.07499933242797852
train start:
critic loss [0.023293138, 0.019808711]
critic loss [0.023643265, 0.020308351]
critic loss [0.023632044, 0.0206198]
critic loss [0.023302341, 0.02072975]
critic loss [0.022751503, 0.020662664]
actor loss -150.251
train end 0.7380621433258057
should be 1.0, predicted: [ 0.95029944], target predicted: [ 0.98389494]
should be -1.0, predicted: [-1.113006], target predicted: [-0.73226964]
should be -1.0, predicted: [-1.39517903], target predicted: [-1.31818616]
start searching new action
get new action:  0.2294785976409912
start getting new_q
get new_q:  0.06213808059692383
train start:
critic loss [0.072095759, 0.029682469]
critic loss [0.066444002, 0.028667793]
critic loss [0.057788506, 0.027193852]
critic loss [0.048615403, 0.025622785]
critic loss [0.040224232, 0.024180889]
actor loss -231.265
train end 0.6283657550811768
should be 1.0, predicted: [ 0.63654768], target predicted: [ 0.636

get new action:  0.24151086807250977
start getting new_q
get new_q:  0.06358814239501953
train start:
critic loss [0.020553952, 0.013671885]
critic loss [0.019421343, 0.013607079]
critic loss [0.017867591, 0.013497256]
critic loss [0.016392285, 0.013392513]
critic loss [0.015124556, 0.013305791]
actor loss -94.0378
train end 0.5747628211975098
should be 1.0, predicted: [ 0.8525188], target predicted: [ 0.79930395]
should be 1.0, predicted: [ 0.89860827], target predicted: [ 0.78055239]
start searching new action
get new action:  0.22510814666748047
start getting new_q
get new_q:  0.06818580627441406
train start:
critic loss [0.030088592, 0.02929342]
critic loss [0.030247824, 0.029469168]
critic loss [0.030183049, 0.029422209]
critic loss [0.029891554, 0.029168598]
critic loss [0.029384408, 0.028727228]
actor loss -274.948
train end 0.5386965274810791
should be 1.0, predicted: [ 0.86164564], target predicted: [ 0.20919374]
should be -1.0, predicted: [-1.0183655], target predicted: [-1.1

get new action:  0.24104714393615723
start getting new_q
get new_q:  0.06662797927856445
train start:
critic loss [0.018856347, 0.018555984]
critic loss [0.019103905, 0.018738233]
critic loss [0.019265402, 0.01885673]
critic loss [0.019312268, 0.018887024]
critic loss [0.019234104, 0.018819392]
actor loss -179.099
train end 0.6024997234344482
should be 1.0, predicted: [ 1.06159985], target predicted: [ 0.89487815]
start searching new action
get new action:  0.25801563262939453
start getting new_q
get new_q:  0.07037615776062012
train start:
critic loss [0.036700897, 0.02852024]
critic loss [0.035452642, 0.02801872]
critic loss [0.033578791, 0.02718877]
critic loss [0.031354897, 0.026162915]
critic loss [0.029051054, 0.025075737]
actor loss -284.333
train end 0.7479407787322998
should be -1.0, predicted: [-0.85582405], target predicted: [-0.74180019]
start searching new action
get new action:  0.241715669631958
start getting new_q
get new_q:  0.07210636138916016
train start:
critic loss

critic loss [0.017977016, 0.017977016]
actor loss -255.371
train end 0.5195028781890869
start searching new action
get new action:  0.2222433090209961
start getting new_q
get new_q:  0.06093597412109375
train start:
critic loss [0.057433654, 0.025823204]
critic loss [0.054175861, 0.025020361]
critic loss [0.048428625, 0.023929883]
critic loss [0.041294698, 0.022715092]
critic loss [0.034070931, 0.021562148]
actor loss -107.721
train end 0.537623405456543
should be 1.0, predicted: [ 0.66581786], target predicted: [ 0.80524206]
should be 1.0, predicted: [ 0.74869066], target predicted: [ 0.87770897]
should be 1.0, predicted: [ 0.73357177], target predicted: [ 0.65422857]
should be 1.0, predicted: [ 0.83714676], target predicted: [ 0.72607964]
should be 1.0, predicted: [ 0.45811242], target predicted: [ 0.78352684]
start searching new action
get new action:  0.23769140243530273
start getting new_q
get new_q:  0.07170939445495605
train start:
critic loss [0.021445436, 0.017006341]
critic l

Episode : 699 Replay Buffer 50000
TOTAL REWARD @ 699-th Episode  : Reward 1
Total Step: 94
start searching new action
get new action:  0.25205278396606445
start getting new_q
get new_q:  0.06728363037109375
train start:
critic loss [0.023359045, 0.021246988]
critic loss [0.023516502, 0.021597695]
critic loss [0.023616286, 0.021963708]
critic loss [0.023591865, 0.022265567]
critic loss [0.023413526, 0.022438657]
actor loss -74.3452
train end 0.617896556854248
should be 1.0, predicted: [ 0.82007468], target predicted: [ 0.74768674]
should be 1.0, predicted: [ 1.00643754], target predicted: [ 0.83545828]
start searching new action
get new action:  0.23999667167663574
start getting new_q
get new_q:  0.0642995834350586
train start:
critic loss [0.083721101, 0.033728532]
critic loss [0.07594192, 0.033550814]
critic loss [0.063878611, 0.032903831]
critic loss [0.050976202, 0.032146901]
critic loss [0.040383644, 0.031522114]
actor loss -268.879
train end 0.5600693225860596
should be 1.0, predi

critic loss [0.027509268, 0.027509268]
actor loss -108.224
train end 0.5613975524902344
start searching new action
get new action:  0.24857306480407715
start getting new_q
get new_q:  0.06759905815124512
train start:
critic loss [0.04392232, 0.032361224]
critic loss [0.042572491, 0.031535648]
critic loss [0.039979696, 0.030266974]
critic loss [0.036618613, 0.028741203]
critic loss [0.033064615, 0.027161349]
actor loss 17.089
train end 0.5375375747680664
should be 1.0, predicted: [ 0.88802993], target predicted: [ 0.78704554]
should be 1.0, predicted: [ 1.20017874], target predicted: [ 1.0156219]
should be -1.0, predicted: [-0.24379773], target predicted: [-0.81104606]
start searching new action
get new action:  0.232804536819458
start getting new_q
get new_q:  0.06457376480102539
train start:
critic loss [0.033257715, 0.020546675]
critic loss [0.030110374, 0.019404691]
critic loss [0.027029963, 0.018441997]
critic loss [0.024420151, 0.01773385]
critic loss [0.022520822, 0.017299265]
ac

critic loss [0.029009029, 0.028997071]
actor loss -68.0376
train end 0.62386155128479
should be -1.0, predicted: [-0.83208603], target predicted: [-0.94865286]
start searching new action
get new action:  0.22306323051452637
start getting new_q
get new_q:  0.06879329681396484
train start:
critic loss [0.02281937, 0.019864922]
critic loss [0.022300171, 0.019719481]
critic loss [0.02162932, 0.019520659]
critic loss [0.020882005, 0.019291224]
critic loss [0.020152958, 0.019052465]
actor loss -28.7328
train end 0.5461711883544922
should be -1.0, predicted: [-1.07201934], target predicted: [-1.12368858]
start searching new action
get new action:  0.22698402404785156
start getting new_q
get new_q:  0.06819438934326172
train start:
critic loss [0.043835022, 0.025885798]
critic loss [0.041883279, 0.02563348]
critic loss [0.038149521, 0.025053486]
critic loss [0.033939589, 0.024328189]
critic loss [0.030626183, 0.023607537]
actor loss -263.355
train end 0.5405144691467285
should be -1.0, predict

critic loss [0.018172355, 0.01813357]
actor loss -196.467
train end 0.6638858318328857
should be -1.0, predicted: [-1.28697491], target predicted: [-1.0096333]
should be 1.0, predicted: [ 1.06274629], target predicted: [ 0.97130716]
start searching new action
get new action:  0.24088191986083984
start getting new_q
get new_q:  0.06589984893798828
train start:
critic loss [0.026103629, 0.023620574]
critic loss [0.025586072, 0.023411691]
critic loss [0.024789274, 0.023055285]
critic loss [0.02384503, 0.022600882]
critic loss [0.022881327, 0.022098131]
actor loss -301.48
train end 0.5531594753265381
should be 1.0, predicted: [ 1.29627693], target predicted: [ 1.12631869]
should be -1.0, predicted: [-1.15328848], target predicted: [-0.98756391]
start searching new action
get new action:  0.22894811630249023
start getting new_q
get new_q:  0.08093881607055664
train start:
critic loss [0.037162073, 0.037162073]
critic loss [0.036119826, 0.036119826]
critic loss [0.034983527, 0.034983527]
cri

get new action:  0.24915242195129395
start getting new_q
get new_q:  0.061792612075805664
train start:
critic loss [0.028092891, 0.015181193]
critic loss [0.026941249, 0.014755072]
critic loss [0.024999265, 0.014265067]
critic loss [0.022643313, 0.013759833]
critic loss [0.020132773, 0.01326875]
actor loss -78.3972
train end 0.5745999813079834
should be 1.0, predicted: [ 0.93855274], target predicted: [ 0.79049897]
should be -1.0, predicted: [-0.7848596], target predicted: [-0.84700024]
should be -1.0, predicted: [-1.08998895], target predicted: [-1.13170028]
start searching new action
get new action:  0.2485671043395996
start getting new_q
get new_q:  0.07208609580993652
train start:
critic loss [0.032308478, 0.023458172]
critic loss [0.031366844, 0.023366041]
critic loss [0.029987322, 0.023192845]
critic loss [0.028387008, 0.022970349]
critic loss [0.026753034, 0.022726964]
actor loss 167.648
train end 0.8811061382293701
should be 1.0, predicted: [ 1.09732008], target predicted: [ 0.

get new action:  0.24611163139343262
start getting new_q
get new_q:  0.07342386245727539
train start:
critic loss [0.020849418, 0.020615313]
critic loss [0.020924818, 0.020592419]
critic loss [0.020710327, 0.020289103]
critic loss [0.020228069, 0.019738805]
critic loss [0.019527115, 0.018997598]
actor loss -30.5584
train end 0.5629613399505615
should be 1.0, predicted: [ 0.85026366], target predicted: [ 0.79184133]
start searching new action
get new action:  0.24536538124084473
start getting new_q
get new_q:  0.06796813011169434
train start:
critic loss [0.030864168, 0.028701531]
critic loss [0.030159242, 0.028154856]
critic loss [0.029336065, 0.027597155]
critic loss [0.028501878, 0.027079802]
critic loss [0.027740553, 0.026633436]
actor loss -230.098
train end 0.5551896095275879
should be -1.0, predicted: [-0.99262476], target predicted: [-1.17774868]
should be 1.0, predicted: [ 0.8714956], target predicted: [ 0.89170355]
should be 1.0, predicted: [ 0.75936902], target predicted: [ 0

Episode : 710 Replay Buffer 50000
TOTAL REWARD @ 710-th Episode  : Reward 1
Total Step: 95
start searching new action
get new action:  0.24829435348510742
start getting new_q
get new_q:  0.06909632682800293
train start:
critic loss [0.061022468, 0.023056578]
critic loss [0.057753704, 0.022900786]
critic loss [0.050936297, 0.022082467]
critic loss [0.042551994, 0.020825341]
critic loss [0.034101304, 0.019372914]
actor loss -106.526
train end 0.5263664722442627
should be 1.0, predicted: [ 0.95956099], target predicted: [ 0.78874838]
should be 1.0, predicted: [ 1.21109855], target predicted: [ 0.9432202]
should be 1.0, predicted: [ 0.90677607], target predicted: [ 0.45929313]
should be 1.0, predicted: [ 1.24924624], target predicted: [ 1.07219255]
should be 1.0, predicted: [ 0.99070615], target predicted: [ 0.82499087]
start searching new action
get new action:  0.23826384544372559
start getting new_q
get new_q:  0.07466983795166016
train start:
critic loss [0.057100009, 0.018960692]
crit

critic loss [0.033106435, 0.032664079]
critic loss [0.033303034, 0.03277272]
critic loss [0.033351377, 0.032753162]
critic loss [0.033227704, 0.032588176]
actor loss -113.171
train end 0.581071138381958
should be 1.0, predicted: [ 1.11455393], target predicted: [ 1.04801083]
start searching new action
get new action:  0.22382020950317383
start getting new_q
get new_q:  0.06698131561279297
train start:
critic loss [0.081274599, 0.04399262]
critic loss [0.079266965, 0.043563459]
critic loss [0.07516934, 0.042813286]
critic loss [0.069600135, 0.041807834]
critic loss [0.063296579, 0.04063813]
actor loss -361.779
train end 0.5253839492797852
should be -1.0, predicted: [-1.15813303], target predicted: [-1.10672474]
should be 1.0, predicted: [ 0.2473045], target predicted: [-0.21595171]
Episode : 712 Replay Buffer 50000
TOTAL REWARD @ 712-th Episode  : Reward 1
Total Step: 122
start searching new action
get new action:  0.24369049072265625
start getting new_q
get new_q:  0.06435394287109375


get new action:  0.23339557647705078
start getting new_q
get new_q:  0.07890582084655762
train start:
critic loss [0.023248695, 0.016280411]
critic loss [0.02280131, 0.016716044]
critic loss [0.022137227, 0.017100915]
critic loss [0.021377202, 0.017406289]
critic loss [0.020613361, 0.017613187]
actor loss -30.3555
train end 0.5615804195404053
should be 1.0, predicted: [ 0.97011554], target predicted: [ 0.55083448]
should be 1.0, predicted: [ 0.99525619], target predicted: [ 0.83529645]
should be 1.0, predicted: [ 0.89977646], target predicted: [ 0.51923841]
should be 1.0, predicted: [ 1.03947198], target predicted: [ 1.01629603]
start searching new action
get new action:  0.24004578590393066
start getting new_q
get new_q:  0.07121920585632324
train start:
critic loss [0.037415992, 0.030317843]
critic loss [0.037293386, 0.030283354]
critic loss [0.035426244, 0.02949295]
critic loss [0.032419875, 0.028169004]
critic loss [0.029015725, 0.02657005]
actor loss -372.472
train end 0.547894239

get new action:  0.23785686492919922
start getting new_q
get new_q:  0.06980514526367188
train start:
critic loss [0.037341736, 0.036582079]
critic loss [0.037511226, 0.036873933]
critic loss [0.037622876, 0.037043042]
critic loss [0.037645869, 0.037081316]
critic loss [0.03754773, 0.036981162]
actor loss -374.546
train end 0.5414466857910156
should be 1.0, predicted: [ 0.9743591], target predicted: [ 0.85105687]
should be -1.0, predicted: [-1.01885319], target predicted: [-1.03281677]
start searching new action
get new action:  0.23476266860961914
start getting new_q
get new_q:  0.07364368438720703
train start:
critic loss [0.025265539, 0.023449671]
critic loss [0.025114797, 0.023395739]
critic loss [0.024758413, 0.02320401]
critic loss [0.024229541, 0.022887781]
critic loss [0.023578307, 0.022466229]
actor loss -277.664
train end 0.5196595191955566
should be 1.0, predicted: [ 1.18260753], target predicted: [ 0.94749945]
should be 1.0, predicted: [ 0.97132158], target predicted: [ 0.8

actor loss -178.557
train end 0.69535231590271
should be 1.0, predicted: [ 0.9216907], target predicted: [ 0.91165024]
should be 1.0, predicted: [ 0.89755434], target predicted: [ 0.83753943]
should be 1.0, predicted: [ 0.37232187], target predicted: [ 0.06458917]
should be -1.0, predicted: [-1.05801427], target predicted: [-1.09559]
start searching new action
get new action:  0.24532270431518555
start getting new_q
get new_q:  0.07303595542907715
train start:
critic loss [0.026413092, 0.023993233]
critic loss [0.025890131, 0.023881635]
critic loss [0.025224233, 0.0236752]
critic loss [0.0245196, 0.02342473]
critic loss [0.0238261, 0.023134638]
actor loss 133.992
train end 0.5796592235565186
should be 1.0, predicted: [ 0.85550547], target predicted: [ 0.85997486]
should be 1.0, predicted: [ 0.9350971], target predicted: [ 0.88715935]
start searching new action
get new action:  0.2372739315032959
start getting new_q
get new_q:  0.06376981735229492
train start:
critic loss [0.025061943, 

critic loss [0.020850569, 0.020827927]
critic loss [0.020474376, 0.020442154]
critic loss [0.020055629, 0.020009162]
critic loss [0.019623779, 0.019558158]
actor loss -356.476
train end 0.5492103099822998
should be -1.0, predicted: [-0.78344518], target predicted: [-1.13639009]
start searching new action
get new action:  0.22486615180969238
start getting new_q
get new_q:  0.06966018676757812
train start:
critic loss [0.035513349, 0.025621435]
critic loss [0.034294602, 0.025142865]
critic loss [0.03269776, 0.024610575]
critic loss [0.030998018, 0.024088934]
critic loss [0.029427111, 0.023627281]
actor loss -155.348
train end 0.6148042678833008
should be -1.0, predicted: [-0.54209244], target predicted: [-0.38883919]
should be -1.0, predicted: [-0.66319865], target predicted: [-1.00490355]
should be 1.0, predicted: [ 0.88182646], target predicted: [ 1.00133777]
should be 1.0, predicted: [ 0.86293137], target predicted: [ 1.10657012]
should be -1.0, predicted: [-0.70526022], target predic

get new action:  0.23721575736999512
start getting new_q
get new_q:  0.0704641342163086
train start:
critic loss [0.062090136, 0.020401146]
critic loss [0.062701717, 0.021456782]
critic loss [0.058341615, 0.021984784]
critic loss [0.050030507, 0.021974403]
critic loss [0.040432516, 0.021605207]
actor loss -87.0767
train end 0.5448856353759766
should be 1.0, predicted: [ 0.0983568], target predicted: [ 0.86476755]
should be 1.0, predicted: [ 0.86695933], target predicted: [ 0.96867484]
should be 1.0, predicted: [ 0.4238528], target predicted: [ 0.47586977]
should be 1.0, predicted: [ 0.55686796], target predicted: [ 0.31839153]
start searching new action
get new action:  0.2315366268157959
start getting new_q
get new_q:  0.06430172920227051
train start:
critic loss [0.031358991, 0.030929986]
critic loss [0.031297635, 0.030748878]
critic loss [0.030546017, 0.029926866]
critic loss [0.029291242, 0.02865665]
critic loss [0.027745565, 0.027147308]
actor loss -286.334
train end 0.54034042358

critic loss [0.029214464, 0.023758931]
critic loss [0.02736246, 0.023076696]
critic loss [0.02553096, 0.022362264]
actor loss -391.051
train end 0.7850227355957031
should be -1.0, predicted: [-1.02676225], target predicted: [-0.90277892]
should be -1.0, predicted: [-1.19036138], target predicted: [-1.04512155]
should be 1.0, predicted: [ 0.99903065], target predicted: [ 0.97613865]
start searching new action
get new action:  0.23245000839233398
start getting new_q
get new_q:  0.06427693367004395
train start:
critic loss [0.028306901, 0.017996967]
critic loss [0.027903708, 0.018118225]
critic loss [0.026606444, 0.018088188]
critic loss [0.024772892, 0.017940249]
critic loss [0.022785734, 0.017715473]
actor loss -18.2963
train end 0.6525201797485352
should be 1.0, predicted: [ 0.90523046], target predicted: [ 0.96882635]
should be 1.0, predicted: [ 0.84552449], target predicted: [ 0.61832005]
should be 1.0, predicted: [ 1.10856032], target predicted: [ 1.11358678]
should be 1.0, predicte

critic loss [0.060044359, 0.025781553]
actor loss -161.732
train end 0.5992159843444824
should be 1.0, predicted: [ 1.11931574], target predicted: [ 1.04578507]
should be 1.0, predicted: [-0.06042063], target predicted: [-0.09764305]
should be 1.0, predicted: [ 0.7620098], target predicted: [ 0.66926187]
should be 1.0, predicted: [ 0.82928669], target predicted: [ 0.83030766]
start searching new action
get new action:  0.22327709197998047
start getting new_q
get new_q:  0.06294751167297363
train start:
critic loss [0.036615435, 0.02144902]
critic loss [0.037438042, 0.021406742]
critic loss [0.0364931, 0.020816136]
critic loss [0.034099922, 0.019831732]
critic loss [0.03076156, 0.018637445]
actor loss -501.703
train end 0.548159122467041
should be 1.0, predicted: [ 0.43714765], target predicted: [ 0.9690783]
should be 1.0, predicted: [ 0.86912614], target predicted: [ 0.94442832]
should be -1.0, predicted: [-0.84633046], target predicted: [-0.27268651]
should be 1.0, predicted: [ 0.8076

critic loss [0.030491885, 0.020042211]
critic loss [0.029276554, 0.019928081]
critic loss [0.027895449, 0.019789996]
critic loss [0.026467243, 0.019632278]
actor loss -284.963
train end 0.6424291133880615
should be -1.0, predicted: [-1.01403999], target predicted: [-0.99405807]
should be -1.0, predicted: [-0.59361911], target predicted: [-0.36501816]
start searching new action
get new action:  0.2529613971710205
start getting new_q
get new_q:  0.06727051734924316
train start:
critic loss [0.026174005, 0.022228528]
critic loss [0.026090547, 0.022140045]
critic loss [0.025648601, 0.021956507]
critic loss [0.024920886, 0.021692265]
critic loss [0.024001531, 0.021364836]
actor loss -143.132
train end 0.7800164222717285
should be 1.0, predicted: [ 0.99894273], target predicted: [ 0.91198528]
should be 1.0, predicted: [ 0.84578443], target predicted: [ 0.86600721]
start searching new action
get new action:  0.23244047164916992
start getting new_q
get new_q:  0.07406973838806152
train start:


Episode : 728 Replay Buffer 50000
TOTAL REWARD @ 728-th Episode  : Reward 1
Total Step: 135
start searching new action
get new action:  0.2444775104522705
start getting new_q
get new_q:  0.07004141807556152
train start:
critic loss [0.03388007, 0.03137625]
critic loss [0.033648189, 0.031449463]
critic loss [0.033121694, 0.031285539]
critic loss [0.032363273, 0.030906625]
critic loss [0.031437602, 0.030343588]
actor loss -309.886
train end 0.610539436340332
should be -1.0, predicted: [-1.11334956], target predicted: [-1.05226386]
start searching new action
get new action:  0.23749804496765137
start getting new_q
get new_q:  0.06443572044372559
train start:
critic loss [0.029440114, 0.026360178]
critic loss [0.029193504, 0.02633822]
critic loss [0.028772879, 0.026228959]
critic loss [0.028228579, 0.026046831]
critic loss [0.027602572, 0.025801759]
actor loss -309.738
train end 0.5717706680297852
should be -1.0, predicted: [-0.77386469], target predicted: [-0.91282761]
should be 1.0, pred

Episode : 730 Replay Buffer 50000
TOTAL REWARD @ 730-th Episode  : Reward 1
Total Step: 115
start searching new action
get new action:  0.24708008766174316
start getting new_q
get new_q:  0.0667262077331543
train start:
critic loss [0.022135273, 0.021793876]
critic loss [0.021878242, 0.021519255]
critic loss [0.021489838, 0.0211337]
critic loss [0.021017067, 0.020675194]
critic loss [0.020501368, 0.020178054]
actor loss -541.872
train end 0.5431458950042725
should be 1.0, predicted: [ 0.83236086], target predicted: [ 0.83132279]
should be 1.0, predicted: [ 0.95082706], target predicted: [ 0.91792154]
start searching new action
get new action:  0.23111557960510254
start getting new_q
get new_q:  0.06538105010986328
train start:
critic loss [0.02517987, 0.024319625]
critic loss [0.024915673, 0.024083275]
critic loss [0.024560049, 0.023792094]
critic loss [0.024148215, 0.023466531]
critic loss [0.023702154, 0.02311795]
actor loss -546.706
train end 0.5286397933959961
should be 1.0, predic

get new action:  0.24439573287963867
start getting new_q
get new_q:  0.06658339500427246
train start:
critic loss [0.022717655, 0.020888092]
critic loss [0.02229722, 0.020578071]
critic loss [0.021759445, 0.020228256]
critic loss [0.021165546, 0.019869994]
critic loss [0.020567358, 0.019526564]
actor loss -0.357689
train end 0.542726993560791
should be -1.0, predicted: [-1.02626264], target predicted: [-0.96623474]
should be 1.0, predicted: [ 1.07268703], target predicted: [ 0.96746737]
Episode : 732 Replay Buffer 50000
TOTAL REWARD @ 732-th Episode  : Reward 1
Total Step: 120
start searching new action
get new action:  0.248154878616333
start getting new_q
get new_q:  0.07082891464233398
train start:
critic loss [0.037418764, 0.019058902]
critic loss [0.034884598, 0.018913249]
critic loss [0.030287618, 0.018509788]
critic loss [0.025209878, 0.018005446]
critic loss [0.02091779, 0.017528929]
actor loss -217.87
train end 0.51326584815979
should be 1.0, predicted: [ 0.85652882], target p

get new action:  0.23592805862426758
start getting new_q
get new_q:  0.06720829010009766
train start:
critic loss [0.018298272, 0.018298272]
critic loss [0.018595306, 0.018595306]
critic loss [0.018767677, 0.018767677]
critic loss [0.018799853, 0.018799853]
critic loss [0.018696915, 0.018696915]
actor loss -229.639
train end 0.6123688220977783
Episode : 734 Replay Buffer 50000
TOTAL REWARD @ 734-th Episode  : Reward 1
Total Step: 148
start searching new action
get new action:  0.25577807426452637
start getting new_q
get new_q:  0.07331466674804688
train start:
critic loss [0.043175593, 0.027126044]
critic loss [0.040394522, 0.026798841]
critic loss [0.03636349, 0.026258979]
critic loss [0.032091342, 0.025624068]
critic loss [0.028405948, 0.024986416]
actor loss -376.261
train end 0.614290714263916
should be 1.0, predicted: [ 0.60302007], target predicted: [-0.24729908]
start searching new action
get new action:  0.23701000213623047
start getting new_q
get new_q:  0.06605339050292969
tr

get new action:  0.24264788627624512
start getting new_q
get new_q:  0.0698246955871582
train start:
critic loss [0.024943255, 0.022261575]
critic loss [0.024780624, 0.022226587]
critic loss [0.024325959, 0.022019126]
critic loss [0.023627792, 0.021651236]
critic loss [0.022754576, 0.021149978]
actor loss 49.0498
train end 0.968024492263794
should be 1.0, predicted: [ 0.93215412], target predicted: [ 0.9362042]
should be 1.0, predicted: [ 0.81398731], target predicted: [ 0.59229517]
start searching new action
get new action:  0.27025485038757324
start getting new_q
get new_q:  0.06616902351379395
train start:
critic loss [0.018733975, 0.017687455]
critic loss [0.01842029, 0.017411459]
critic loss [0.018014988, 0.017113607]
critic loss [0.017555349, 0.016808562]
critic loss [0.017076124, 0.016507767]
actor loss -77.7324
train end 0.7269091606140137
should be 1.0, predicted: [ 0.79886508], target predicted: [ 0.71731597]
Episode : 736 Replay Buffer 50000
TOTAL REWARD @ 736-th Episode  : 

get new action:  0.23493552207946777
start getting new_q
get new_q:  0.060671091079711914
train start:
critic loss [0.059936415, 0.032661892]
critic loss [0.058585733, 0.032044381]
critic loss [0.054886259, 0.03109757]
critic loss [0.049824983, 0.030110633]
critic loss [0.044449084, 0.029318703]
actor loss -222.964
train end 0.5181455612182617
should be 1.0, predicted: [ 0.70355529], target predicted: [ 0.36472765]
should be 1.0, predicted: [ 0.9565081], target predicted: [ 0.89148647]
should be 1.0, predicted: [ 0.62847], target predicted: [ 0.86873472]
start searching new action
get new action:  0.22681069374084473
start getting new_q
get new_q:  0.06451559066772461
train start:
critic loss [0.051724136, 0.026412465]
critic loss [0.048055083, 0.025958922]
critic loss [0.044392124, 0.025739333]
critic loss [0.041040991, 0.025743239]
critic loss [0.038192898, 0.025921963]
actor loss -124.024
train end 0.5235424041748047
should be 1.0, predicted: [ 1.03124952], target predicted: [ 0.958

critic loss [0.032859128, 0.029904075]
actor loss -758.546
train end 0.5081002712249756
should be 1.0, predicted: [ 0.69438982], target predicted: [ 0.94244295]
start searching new action
get new action:  0.24084711074829102
start getting new_q
get new_q:  0.06258821487426758
train start:
critic loss [0.030882232, 0.025797304]
critic loss [0.02917996, 0.024866313]
critic loss [0.026986178, 0.023536224]
critic loss [0.024645429, 0.022054631]
critic loss [0.022464372, 0.020651601]
actor loss -396.617
train end 0.5179145336151123
should be 1.0, predicted: [ 0.85919493], target predicted: [ 0.8620131]
should be 1.0, predicted: [ 0.84897703], target predicted: [ 0.62555146]
start searching new action
get new action:  0.2333683967590332
start getting new_q
get new_q:  0.06807518005371094
train start:
critic loss [0.066809401, 0.031803656]
critic loss [0.062552243, 0.030763645]
critic loss [0.057098787, 0.029932879]
critic loss [0.051404208, 0.029444279]
critic loss [0.046053194, 0.02932068]


get new action:  0.23054862022399902
start getting new_q
get new_q:  0.061922550201416016
train start:
critic loss [0.061129458, 0.027711201]
critic loss [0.057442043, 0.02629621]
critic loss [0.052945931, 0.02507264]
critic loss [0.048235126, 0.02412441]
critic loss [0.043691438, 0.023484962]
actor loss -160.544
train end 0.5607473850250244
should be -1.0, predicted: [-1.00211692], target predicted: [-0.95448983]
should be 1.0, predicted: [ 0.57492018], target predicted: [ 0.69964987]
should be 1.0, predicted: [ 0.58925694], target predicted: [ 0.76882797]
start searching new action
get new action:  0.228837251663208
start getting new_q
get new_q:  0.06244230270385742
train start:
critic loss [0.017308779, 0.017303223]
critic loss [0.018433612, 0.018433232]
critic loss [0.01946255, 0.019454077]
critic loss [0.02028011, 0.02025995]
critic loss [0.02080917, 0.02077939]
actor loss -243.81
train end 0.540264368057251
should be 1.0, predicted: [ 1.01439953], target predicted: [ 0.99796659]

critic loss [0.023651972, 0.023601256]
critic loss [0.023585707, 0.023562981]
critic loss [0.023410985, 0.023404311]
critic loss [0.023144495, 0.023144033]
actor loss -218.022
train end 0.5056829452514648
should be 1.0, predicted: [ 1.04548931], target predicted: [ 0.96624756]
Episode : 743 Replay Buffer 50000
TOTAL REWARD @ 743-th Episode  : Reward -1
Total Step: 122
start searching new action
get new action:  0.2696263790130615
start getting new_q
get new_q:  0.07239508628845215
train start:
critic loss [0.028597292, 0.02025307]
critic loss [0.027845126, 0.020000268]
critic loss [0.026471917, 0.019541323]
critic loss [0.024707021, 0.018932819]
critic loss [0.022789072, 0.018235505]
actor loss -104.787
train end 0.9482455253601074
should be 1.0, predicted: [ 0.79661453], target predicted: [ 0.85579616]
should be 1.0, predicted: [ 1.08651304], target predicted: [ 1.1696763]
start searching new action
get new action:  0.2679922580718994
start getting new_q
get new_q:  0.0696649551391601

critic loss [0.02646295, 0.02646295]
actor loss 195.526
train end 0.554710865020752
Episode : 745 Replay Buffer 50000
TOTAL REWARD @ 745-th Episode  : Reward -1
Total Step: 124
start searching new action
get new action:  0.2400517463684082
start getting new_q
get new_q:  0.07169914245605469
train start:
critic loss [0.029864188, 0.024285385]
critic loss [0.029617822, 0.024080805]
critic loss [0.02902136, 0.023683824]
critic loss [0.028131917, 0.023124874]
critic loss [0.027024359, 0.022445282]
actor loss -419.18
train end 0.5227518081665039
should be 1.0, predicted: [ 0.70313102], target predicted: [ 0.76802123]
start searching new action
get new action:  0.23407649993896484
start getting new_q
get new_q:  0.06528759002685547
train start:
critic loss [0.064531676, 0.029250909]
critic loss [0.063290983, 0.028888762]
critic loss [0.060419541, 0.028242229]
critic loss [0.056311365, 0.027384374]
critic loss [0.051411249, 0.026396176]
actor loss -173.81
train end 0.615856409072876
should be

critic loss [0.0202122, 0.020203229]
critic loss [0.019606845, 0.019606426]
critic loss [0.019195229, 0.019178376]
critic loss [0.018941216, 0.018888224]
actor loss -289.516
train end 0.5886657238006592
should be 1.0, predicted: [ 0.99907881], target predicted: [ 0.9311583]
Episode : 747 Replay Buffer 50000
TOTAL REWARD @ 747-th Episode  : Reward 1
Total Step: 125
start searching new action
get new action:  0.24649739265441895
start getting new_q
get new_q:  0.06717801094055176
train start:
critic loss [0.038675133, 0.020987678]
critic loss [0.037911281, 0.02112551]
critic loss [0.03640129, 0.021129997]
critic loss [0.034363359, 0.021008724]
critic loss [0.03205923, 0.020806411]
actor loss -163.457
train end 0.5505259037017822
should be 1.0, predicted: [ 0.87888879], target predicted: [ 0.84694546]
should be -1.0, predicted: [-0.84722942], target predicted: [-0.84573066]
should be -1.0, predicted: [-1.060709], target predicted: [-0.73371774]
should be 1.0, predicted: [ 0.77968276], tar

critic loss [0.032229282, 0.025203599]
actor loss 134.813
train end 0.559802770614624
should be 1.0, predicted: [ 1.1068573], target predicted: [ 1.09848857]
should be 1.0, predicted: [ 0.71590227], target predicted: [ 0.57079577]
Episode : 749 Replay Buffer 50000
TOTAL REWARD @ 749-th Episode  : Reward -1
Total Step: 7
start searching new action
get new action:  0.25017738342285156
start getting new_q
get new_q:  0.07360196113586426
train start:
critic loss [0.030139193, 0.030134514]
critic loss [0.029744538, 0.029731954]
critic loss [0.02953475, 0.029461253]
critic loss [0.029447272, 0.029283363]
critic loss [0.029414648, 0.02915344]
actor loss -132.713
train end 0.6367199420928955
should be 1.0, predicted: [ 1.048684], target predicted: [ 1.00128317]
start searching new action
get new action:  0.23014187812805176
start getting new_q
get new_q:  0.07216286659240723
train start:
critic loss [0.036980622, 0.024496518]
critic loss [0.035928868, 0.024465729]
critic loss [0.033532321, 0.0

get new action:  0.2385392189025879
start getting new_q
get new_q:  0.0696713924407959
train start:
critic loss [0.040798694, 0.020524258]
critic loss [0.039844826, 0.020538818]
critic loss [0.038237095, 0.020450305]
critic loss [0.036161393, 0.020282464]
critic loss [0.033758629, 0.020030485]
actor loss -68.9829
train end 0.5259373188018799
should be -1.0, predicted: [-0.74714661], target predicted: [-0.32159579]
should be 1.0, predicted: [ 0.53380919], target predicted: [ 0.66375726]
start searching new action
get new action:  0.2357630729675293
start getting new_q
get new_q:  0.06592750549316406
train start:
critic loss [0.029967509, 0.029319555]
critic loss [0.030166818, 0.029572312]
critic loss [0.030033756, 0.029521979]
critic loss [0.029605927, 0.029194389]
critic loss [0.02894545, 0.028638981]
actor loss -301.175
train end 0.534592866897583
should be 1.0, predicted: [ 0.89755148], target predicted: [ 0.91705114]
start searching new action
get new action:  0.23738765716552734
st

start searching new action
get new action:  0.22942352294921875
start getting new_q
get new_q:  0.06476712226867676
train start:
critic loss [0.015300804, 0.01527491]
critic loss [0.015535595, 0.015535534]
critic loss [0.015787713, 0.015774837]
critic loss [0.015996717, 0.015955716]
critic loss [0.016119719, 0.016050316]
actor loss -266.318
train end 0.535733699798584
should be 1.0, predicted: [ 0.86833906], target predicted: [ 0.79963881]
start searching new action
get new action:  0.22319602966308594
start getting new_q
get new_q:  0.06416916847229004
train start:
critic loss [0.07558953, 0.026715837]
critic loss [0.06529884, 0.025877057]
critic loss [0.051833779, 0.024722386]
critic loss [0.039310493, 0.023699939]
critic loss [0.030168906, 0.023057174]
actor loss -222.28
train end 0.9040572643280029
should be 1.0, predicted: [ 0.81002218], target predicted: [ 0.44860873]
should be 1.0, predicted: [ 1.13143861], target predicted: [ 0.96518451]
should be -1.0, predicted: [-1.03038239]

critic loss [0.017790679, 0.017790679]
actor loss -204.051
train end 0.5490860939025879
start searching new action
get new action:  0.24022388458251953
start getting new_q
get new_q:  0.0679922103881836
train start:
critic loss [0.060620796, 0.034863696]
critic loss [0.058584236, 0.034292161]
critic loss [0.055140335, 0.033370174]
critic loss [0.050819412, 0.032227173]
critic loss [0.046020433, 0.030967107]
actor loss -438.889
train end 0.5170021057128906
should be -1.0, predicted: [-0.42291436], target predicted: [-0.40975881]
should be -1.0, predicted: [-0.93413854], target predicted: [-1.10351872]
start searching new action
get new action:  0.2435164451599121
start getting new_q
get new_q:  0.06832480430603027
train start:
critic loss [0.020207116, 0.016177636]
critic loss [0.020451732, 0.016373999]
critic loss [0.020572644, 0.016542014]
critic loss [0.020516189, 0.016655775]
critic loss [0.020263344, 0.016695308]
actor loss -206.117
train end 0.6525373458862305
should be -1.0, pred

get new action:  0.23372840881347656
start getting new_q
get new_q:  0.076416015625
train start:
critic loss [0.056583717, 0.018656326]
critic loss [0.050251119, 0.01798729]
critic loss [0.040473051, 0.016842872]
critic loss [0.030338496, 0.015603445]
critic loss [0.02204171, 0.014551636]
actor loss -100.201
train end 0.5501961708068848
should be 1.0, predicted: [ 0.40973121], target predicted: [ 0.23611082]
start searching new action
get new action:  0.2399735450744629
start getting new_q
get new_q:  0.06275129318237305
train start:
critic loss [0.028187558, 0.027823042]
critic loss [0.029092379, 0.028588276]
critic loss [0.029603779, 0.028996386]
critic loss [0.029705774, 0.029051006]
critic loss [0.029426394, 0.028784322]
actor loss -389.185
train end 0.5387771129608154
should be -1.0, predicted: [-0.87059259], target predicted: [-0.88414139]
start searching new action
get new action:  0.22850561141967773
start getting new_q
get new_q:  0.0663602352142334
train start:
critic loss [0

Episode : 758 Replay Buffer 50000
TOTAL REWARD @ 758-th Episode  : Reward -1
Total Step: 123
start searching new action
get new action:  0.2450404167175293
start getting new_q
get new_q:  0.07048225402832031
train start:
critic loss [0.031856775, 0.021024091]
critic loss [0.027656745, 0.019832194]
critic loss [0.024018921, 0.018640492]
critic loss [0.020910608, 0.017491631]
critic loss [0.018532727, 0.016450558]
actor loss -284.408
train end 0.5798921585083008
should be -1.0, predicted: [-0.64429557], target predicted: [-0.54794168]
should be -1.0, predicted: [-1.1354506], target predicted: [-1.07216918]
start searching new action
get new action:  0.24735283851623535
start getting new_q
get new_q:  0.06691932678222656
train start:
critic loss [0.20830691, 0.040436298]
critic loss [0.20212263, 0.039466124]
critic loss [0.18500784, 0.037115101]
critic loss [0.16118771, 0.034024917]
critic loss [0.13473645, 0.030844603]
actor loss 95.8271
train end 0.6156213283538818
should be -1.0, predi

critic loss [0.055131137, 0.035275333]
actor loss -203.439
train end 0.526315450668335
should be 1.0, predicted: [ 1.19246578], target predicted: [ 0.88942921]
should be 1.0, predicted: [ 1.28200138], target predicted: [ 0.96167916]
should be 1.0, predicted: [ 0.84380466], target predicted: [ 0.63674378]
should be 1.0, predicted: [ 0.90539497], target predicted: [ 0.64653915]
should be 1.0, predicted: [ 1.293033], target predicted: [ 0.99183965]
should be -1.0, predicted: [-0.86782372], target predicted: [-0.92553771]
trained action prob map predicted by initial model for a starting game
[[[  1.00000000e+00   4.05405798e-13   4.09974978e-13   4.08746008e-13
     4.05132145e-13   4.05310713e-13   4.05160741e-13]
  [  4.11982487e-13   4.08148477e-13   4.05916457e-13   4.05488550e-13
     4.05085037e-13   4.05061076e-13   4.05358635e-13]
  [  4.11086096e-13   4.07347414e-13   4.11337061e-13   4.05230320e-13
     4.05302202e-13   4.05439815e-13   4.05085037e-13]
  [  4.07350531e-13   4.078

get new action:  0.25086092948913574
start getting new_q
get new_q:  0.07050037384033203
train start:
critic loss [0.04290963, 0.028688781]
critic loss [0.042070113, 0.028297629]
critic loss [0.04061421, 0.027797185]
critic loss [0.038745608, 0.027233237]
critic loss [0.036676921, 0.026658237]
actor loss -120.038
train end 0.6115689277648926
should be 1.0, predicted: [ 1.11549807], target predicted: [ 1.00773752]
should be 1.0, predicted: [ 0.66839325], target predicted: [ 0.59093833]
should be 1.0, predicted: [ 0.99196106], target predicted: [ 0.83945358]
start searching new action
get new action:  0.23308753967285156
start getting new_q
get new_q:  0.06331562995910645
train start:
critic loss [0.056434177, 0.01830928]
critic loss [0.051086538, 0.017715506]
critic loss [0.042692997, 0.016760085]
critic loss [0.033346932, 0.015699046]
critic loss [0.025282193, 0.014785209]
actor loss -31.3993
train end 0.623176097869873
should be 1.0, predicted: [ 0.81538743], target predicted: [ 0.909

critic loss [0.018619642, 0.015449872]
actor loss -331.672
train end 0.5042154788970947
should be 1.0, predicted: [ 0.78484571], target predicted: [ 0.88745272]
should be 1.0, predicted: [ 0.99098551], target predicted: [ 1.15339971]
start searching new action
get new action:  0.23978400230407715
start getting new_q
get new_q:  0.06188607215881348
train start:
critic loss [0.040240668, 0.027916428]
critic loss [0.039286986, 0.027693497]
critic loss [0.037679546, 0.027326511]
critic loss [0.035634633, 0.026851282]
critic loss [0.033375662, 0.026309408]
actor loss -591.201
train end 0.5177597999572754
should be 1.0, predicted: [ 0.51360643], target predicted: [ 0.75000882]
start searching new action
get new action:  0.24759626388549805
start getting new_q
get new_q:  0.0645132064819336
train start:
critic loss [0.048480168, 0.021891698]
critic loss [0.045334481, 0.021331549]
critic loss [0.040699337, 0.020593243]
critic loss [0.035540711, 0.019795325]
critic loss [0.030661169, 0.01903658

get new action:  0.24715113639831543
start getting new_q
get new_q:  0.06503677368164062
train start:
critic loss [0.021221027, 0.021221027]
critic loss [0.020851681, 0.020851681]
critic loss [0.020497177, 0.020497177]
critic loss [0.020160792, 0.020160792]
critic loss [0.019842312, 0.019842312]
actor loss -404.13
train end 0.5591640472412109
start searching new action
get new action:  0.2577931880950928
start getting new_q
get new_q:  0.08160209655761719
train start:
critic loss [0.016608953, 0.016200088]
critic loss [0.016174104, 0.015885266]
critic loss [0.015691502, 0.015515512]
critic loss [0.015202204, 0.015116109]
critic loss [0.014732448, 0.014704932]
actor loss -173.165
train end 0.6377193927764893
should be 1.0, predicted: [ 0.97606349], target predicted: [ 0.33709493]
start searching new action
get new action:  0.2650468349456787
start getting new_q
get new_q:  0.07530665397644043
train start:
critic loss [0.018019546, 0.017852176]
critic loss [0.018074738, 0.017896559]
crit

get new action:  0.22973275184631348
start getting new_q
get new_q:  0.06714940071105957
train start:
critic loss [0.030705685, 0.029213794]
critic loss [0.030342754, 0.028750869]
critic loss [0.029781301, 0.028192356]
critic loss [0.029060744, 0.027571142]
critic loss [0.028237654, 0.026920423]
actor loss -783.979
train end 0.5410854816436768
should be -1.0, predicted: [-0.97443032], target predicted: [-0.91017705]
should be -1.0, predicted: [-0.94084424], target predicted: [-0.98360831]
start searching new action
get new action:  0.2500154972076416
start getting new_q
get new_q:  0.07284331321716309
train start:
critic loss [0.060323518, 0.02556476]
critic loss [0.057676557, 0.025205705]
critic loss [0.05205556, 0.024556551]
critic loss [0.04467269, 0.023726515]
critic loss [0.0368101, 0.022821039]
actor loss -270.882
train end 0.6038599014282227
should be 1.0, predicted: [ 0.29538175], target predicted: [ 0.30164325]
start searching new action
get new action:  0.2499697208404541
sta

critic loss [0.019559741, 0.014217151]
critic loss [0.017370617, 0.013753718]
critic loss [0.015363568, 0.01332884]
critic loss [0.01384646, 0.012983627]
actor loss -143.395
train end 0.5890498161315918
should be 1.0, predicted: [ 0.75776011], target predicted: [-0.08422883]
should be -1.0, predicted: [-1.04799581], target predicted: [-1.05065954]
should be 1.0, predicted: [ 1.04091501], target predicted: [ 0.94021058]
start searching new action
get new action:  0.23887944221496582
start getting new_q
get new_q:  0.07273435592651367
train start:
critic loss [0.015306816, 0.012053744]
critic loss [0.01488267, 0.011946219]
critic loss [0.014473911, 0.011868467]
critic loss [0.014093205, 0.011805298]
critic loss [0.013747322, 0.011742242]
actor loss 143.301
train end 0.5530023574829102
should be 1.0, predicted: [ 1.30548859], target predicted: [ 0.99929458]
should be 1.0, predicted: [ 1.10381985], target predicted: [ 0.74151057]
should be 1.0, predicted: [ 1.06010842], target predicted: [

get new action:  0.23121905326843262
start getting new_q
get new_q:  0.0649714469909668
train start:
critic loss [0.01960095, 0.017011318]
critic loss [0.019422278, 0.017214241]
critic loss [0.018992305, 0.01721872]
critic loss [0.018354563, 0.017021276]
critic loss [0.017576501, 0.016644161]
actor loss -82.7547
train end 0.5600564479827881
should be 1.0, predicted: [ 1.16563857], target predicted: [ 0.80576807]
should be 1.0, predicted: [ 1.11582756], target predicted: [ 0.79989874]
should be 1.0, predicted: [ 0.94563991], target predicted: [ 0.97781163]
should be -1.0, predicted: [-1.06375837], target predicted: [-1.10016918]
should be 1.0, predicted: [ 0.94213045], target predicted: [ 1.06304407]
should be 1.0, predicted: [ 1.00053406], target predicted: [ 1.08265853]
start searching new action
get new action:  0.2851226329803467
start getting new_q
get new_q:  0.09460639953613281
train start:
critic loss [0.026800774, 0.026764845]
critic loss [0.026165577, 0.026142284]
critic loss 

critic loss [0.019532129, 0.015420351]
critic loss [0.018987257, 0.015171828]
critic loss [0.018126978, 0.014861709]
critic loss [0.017110839, 0.014529917]
actor loss -669.338
train end 0.5269019603729248
should be 1.0, predicted: [ 0.49820292], target predicted: [ 0.8074621]
should be -1.0, predicted: [-1.17837214], target predicted: [-1.01665127]
should be -1.0, predicted: [-1.25244415], target predicted: [-1.04026234]
start searching new action
get new action:  0.23940610885620117
start getting new_q
get new_q:  0.07406830787658691
train start:
critic loss [0.027976818, 0.025911499]
critic loss [0.027085932, 0.025170784]
critic loss [0.026146701, 0.024420673]
critic loss [0.025212202, 0.023704175]
critic loss [0.024307786, 0.023042452]
actor loss -156.565
train end 0.676903247833252
should be -1.0, predicted: [-0.77471954], target predicted: [-0.03519965]
start searching new action
get new action:  0.29163503646850586
start getting new_q
get new_q:  0.10724353790283203
train start:


get new action:  0.23054814338684082
start getting new_q
get new_q:  0.06738638877868652
train start:
critic loss [0.042632096, 0.01959856]
critic loss [0.0420954, 0.019701384]
critic loss [0.03946989, 0.019396443]
critic loss [0.035271309, 0.018753296]
critic loss [0.030418934, 0.017932201]
actor loss 75.6372
train end 0.6823394298553467
should be 1.0, predicted: [ 0.99220181], target predicted: [ 1.0632292]
should be 1.0, predicted: [ 0.87825859], target predicted: [-0.38115564]
should be 1.0, predicted: [ 0.76993418], target predicted: [ 0.7662586]
start searching new action
get new action:  0.2474231719970703
start getting new_q
get new_q:  0.07158780097961426
train start:
critic loss [0.037887659, 0.01957228]
critic loss [0.035644419, 0.019531064]
critic loss [0.033050999, 0.019533511]
critic loss [0.030429451, 0.019588016]
critic loss [0.028024305, 0.019667355]
actor loss 222.591
train end 0.5759227275848389
should be 1.0, predicted: [ 0.93986201], target predicted: [ 0.05975407]

critic loss [0.030151911, 0.029478502]
critic loss [0.02951665, 0.02880263]
critic loss [0.028573669, 0.027867811]
critic loss [0.027410008, 0.02674601]
actor loss -414.892
train end 0.7069969177246094
should be -1.0, predicted: [-0.89629644], target predicted: [-0.78196073]
should be 1.0, predicted: [ 0.87710357], target predicted: [ 0.95061338]
start searching new action
get new action:  0.25393199920654297
start getting new_q
get new_q:  0.06462621688842773
train start:
critic loss [0.025970636, 0.025949808]
critic loss [0.025336957, 0.025273748]
critic loss [0.024611289, 0.024494767]
critic loss [0.02384332, 0.023673316]
critic loss [0.023077134, 0.022861756]
actor loss -214.074
train end 0.5697300434112549
should be 1.0, predicted: [ 0.88560337], target predicted: [ 0.86457318]
start searching new action
get new action:  0.23819565773010254
start getting new_q
get new_q:  0.06465888023376465
train start:
critic loss [0.024221741, 0.023632227]
critic loss [0.024064187, 0.023409694]

critic loss [0.054186419, 0.022020582]
actor loss -374.872
train end 0.5625655651092529
should be 1.0, predicted: [ 0.28783694], target predicted: [ 0.10017212]
should be 1.0, predicted: [ 1.19016862], target predicted: [ 1.05216992]
should be 1.0, predicted: [ 0.67070931], target predicted: [ 0.74076474]
start searching new action
get new action:  0.28102970123291016
start getting new_q
get new_q:  0.08287954330444336
train start:
critic loss [0.10857726, 0.023625392]
critic loss [0.10187405, 0.022886764]
critic loss [0.091049984, 0.021775194]
critic loss [0.077229366, 0.020445503]
critic loss [0.06295003, 0.019179197]
actor loss -199.212
train end 0.6167161464691162
should be 1.0, predicted: [ 1.26571369], target predicted: [ 1.14730322]
should be 1.0, predicted: [ 1.0073024], target predicted: [ 0.94439954]
should be 1.0, predicted: [ 1.1296407], target predicted: [ 0.99049628]
should be -1.0, predicted: [-0.69879133], target predicted: [-0.39244077]
should be 1.0, predicted: [ 0.30

Episode : 780 Replay Buffer 50000
TOTAL REWARD @ 780-th Episode  : Reward 1
Total Step: 112
start searching new action
get new action:  0.27324509620666504
start getting new_q
get new_q:  0.06683158874511719
train start:
critic loss [0.017672399, 0.017447805]
critic loss [0.017487561, 0.017333375]
critic loss [0.017271571, 0.017175727]
critic loss [0.017017622, 0.01696079]
critic loss [0.016713431, 0.016676333]
actor loss -158.887
train end 0.6154227256774902
should be 1.0, predicted: [ 0.87939358], target predicted: [ 0.68378448]
should be 1.0, predicted: [ 0.58738649], target predicted: [ 0.58218026]
start searching new action
get new action:  0.21842241287231445
start getting new_q
get new_q:  0.06289935111999512
train start:
critic loss [0.13413517, 0.035759702]
critic loss [0.12165073, 0.034365505]
critic loss [0.10185913, 0.032115903]
critic loss [0.08240556, 0.029791392]
critic loss [0.065938324, 0.027741518]
actor loss -696.899
train end 0.5609736442565918
should be -1.0, predi

train end 0.5616941452026367
should be -1.0, predicted: [-0.73472941], target predicted: [-0.87109095]
should be 1.0, predicted: [ 0.82250053], target predicted: [ 0.85143977]
should be 1.0, predicted: [ 1.01693022], target predicted: [ 0.97727972]
Episode : 782 Replay Buffer 50000
TOTAL REWARD @ 782-th Episode  : Reward 1
Total Step: 119
start searching new action
get new action:  0.23338031768798828
start getting new_q
get new_q:  0.0848531723022461
train start:
critic loss [0.042251606, 0.042129699]
critic loss [0.041362524, 0.041231707]
critic loss [0.040765785, 0.040636163]
critic loss [0.04037945, 0.040259987]
critic loss [0.040113464, 0.040011052]
actor loss -291.332
train end 0.6396698951721191
should be 1.0, predicted: [ 1.09123075], target predicted: [ 1.06014538]
start searching new action
get new action:  0.24611830711364746
start getting new_q
get new_q:  0.08142352104187012
train start:
critic loss [0.023607165, 0.022892879]
critic loss [0.023936348, 0.023347402]
critic l

Episode : 784 Replay Buffer 50000
TOTAL REWARD @ 784-th Episode  : Reward -1
Total Step: 159
start searching new action
get new action:  0.2326974868774414
start getting new_q
get new_q:  0.06489968299865723
train start:
critic loss [0.030849256, 0.019980095]
critic loss [0.029436182, 0.019582119]
critic loss [0.027311761, 0.01912079]
critic loss [0.024833936, 0.018622773]
critic loss [0.022303581, 0.018118823]
actor loss -364.95
train end 0.5374233722686768
should be 1.0, predicted: [ 0.70186234], target predicted: [ 0.36270437]
should be 1.0, predicted: [ 1.03890586], target predicted: [ 0.94697046]
start searching new action
get new action:  0.23087525367736816
start getting new_q
get new_q:  0.0667569637298584
train start:
critic loss [0.041135423, 0.016773157]
critic loss [0.039959714, 0.016670432]
critic loss [0.037430227, 0.016423939]
critic loss [0.034051251, 0.016120099]
critic loss [0.030284736, 0.015843172]
actor loss -461.101
train end 0.5219986438751221
should be 1.0, pred

critic loss [0.069639437, 0.022813093]
critic loss [0.065743476, 0.022433631]
critic loss [0.060622573, 0.02201945]
critic loss [0.054817915, 0.021620039]
actor loss -290.597
train end 0.6383061408996582
should be 1.0, predicted: [ 0.22181496], target predicted: [ 0.13471158]
should be 1.0, predicted: [ 1.1978451], target predicted: [ 0.91466874]
should be 1.0, predicted: [ 1.10622787], target predicted: [ 0.86227304]
should be 1.0, predicted: [ 1.21288455], target predicted: [ 1.1125778]
start searching new action
get new action:  0.23292803764343262
start getting new_q
get new_q:  0.07582736015319824
train start:
critic loss [0.035849858, 0.024321133]
critic loss [0.036368102, 0.024682708]
critic loss [0.036204468, 0.024735831]
critic loss [0.035429865, 0.024501409]
critic loss [0.03415696, 0.02403779]
actor loss -476.292
train end 0.7834668159484863
should be -1.0, predicted: [-0.67748493], target predicted: [-0.64113587]
should be 1.0, predicted: [ 1.02587116], target predicted: [ 

Episode : 788 Replay Buffer 50000
TOTAL REWARD @ 788-th Episode  : Reward 1
Total Step: 140
start searching new action
get new action:  0.24847126007080078
start getting new_q
get new_q:  0.06818890571594238
train start:
critic loss [0.090011947, 0.024123147]
critic loss [0.083012082, 0.023464374]
critic loss [0.073531657, 0.02288416]
critic loss [0.062865257, 0.022518523]
critic loss [0.052317325, 0.022452297]
actor loss -458.229
train end 0.619572639465332
should be 1.0, predicted: [ 1.12756288], target predicted: [ 0.9631623]
should be 1.0, predicted: [ 1.01510477], target predicted: [ 0.98384911]
should be 1.0, predicted: [ 0.87729806], target predicted: [ 0.78995645]
should be 1.0, predicted: [ 0.09006536], target predicted: [-0.16900884]
start searching new action
get new action:  0.2455732822418213
start getting new_q
get new_q:  0.0787971019744873
train start:
critic loss [0.11784774, 0.032281097]
critic loss [0.1115338, 0.033242445]
critic loss [0.10064559, 0.033570405]
critic

get new action:  0.2355027198791504
start getting new_q
get new_q:  0.07365083694458008
train start:
critic loss [0.015889712, 0.015889712]
critic loss [0.015654948, 0.015654948]
critic loss [0.015464945, 0.015464945]
critic loss [0.015311793, 0.015311793]
critic loss [0.015183942, 0.015183942]
actor loss -200.861
train end 0.5473253726959229
start searching new action
get new action:  0.23317408561706543
start getting new_q
get new_q:  0.0671839714050293
train start:
critic loss [0.061656781, 0.026502142]
critic loss [0.059184879, 0.026431901]
critic loss [0.05489739, 0.026029509]
critic loss [0.049441919, 0.025347751]
critic loss [0.043315712, 0.024454141]
actor loss -204.243
train end 0.5220510959625244
should be -1.0, predicted: [-0.6313529], target predicted: [-0.70593017]
should be 1.0, predicted: [ 1.24889278], target predicted: [ 0.94027758]
should be 1.0, predicted: [ 0.65646005], target predicted: [-0.12210345]
trained action prob map predicted by initial model for a starting

critic loss [0.020717755, 0.020717755]
critic loss [0.020329457, 0.020329457]
critic loss [0.01982774, 0.01982774]
critic loss [0.019258969, 0.019258969]
actor loss -352.328
train end 0.5889067649841309
start searching new action
get new action:  0.23544597625732422
start getting new_q
get new_q:  0.06978726387023926
train start:
critic loss [0.064232439, 0.024528032]
critic loss [0.058376603, 0.023859188]
critic loss [0.048735783, 0.022743184]
critic loss [0.03831837, 0.021491822]
critic loss [0.02918089, 0.020307556]
actor loss -520.706
train end 0.5261256694793701
should be 1.0, predicted: [ 0.47816935], target predicted: [ 0.42358229]
should be -1.0, predicted: [-0.83569229], target predicted: [-1.04171062]
should be -1.0, predicted: [-0.76614374], target predicted: [-0.80508208]
start searching new action
get new action:  0.2421717643737793
start getting new_q
get new_q:  0.06845569610595703
train start:
critic loss [0.015764423, 0.015764423]
critic loss [0.015795773, 0.015795773]

critic loss [0.03603664, 0.025244929]
critic loss [0.035413921, 0.024788607]
critic loss [0.033989891, 0.024239842]
critic loss [0.032007691, 0.023632064]
actor loss -384.142
train end 0.6723754405975342
should be 1.0, predicted: [ 0.73223609], target predicted: [ 0.86508834]
should be 1.0, predicted: [ 0.77735531], target predicted: [ 0.85565364]
should be 1.0, predicted: [ 0.88776803], target predicted: [ 1.0290997]
should be -1.0, predicted: [-1.30090415], target predicted: [-1.08787858]
should be 1.0, predicted: [ 1.0537194], target predicted: [ 0.96258497]
should be 1.0, predicted: [ 1.01082933], target predicted: [ 1.06913924]
start searching new action
get new action:  0.24267840385437012
start getting new_q
get new_q:  0.07623600959777832
train start:
critic loss [0.01949512, 0.018842036]
critic loss [0.01929941, 0.018784393]
critic loss [0.019091919, 0.018713633]
critic loss [0.018897725, 0.018635456]
critic loss [0.018722795, 0.018546909]
actor loss -331.898
train end 0.58295

critic loss [0.034387287, 0.023336347]
critic loss [0.032457814, 0.022808405]
critic loss [0.030140631, 0.022156285]
critic loss [0.027645066, 0.021450216]
actor loss -312.005
train end 0.5610384941101074
should be 1.0, predicted: [ 1.01359499], target predicted: [ 0.85321587]
should be -1.0, predicted: [-0.8033579], target predicted: [-0.80723929]
should be 1.0, predicted: [ 0.69014686], target predicted: [-0.04461442]
should be -1.0, predicted: [-0.96063131], target predicted: [-0.99614495]
Episode : 795 Replay Buffer 50000
TOTAL REWARD @ 795-th Episode  : Reward -1
Total Step: 115
start searching new action
get new action:  0.25804829597473145
start getting new_q
get new_q:  0.06994199752807617
train start:
critic loss [0.022315841, 0.021594107]
critic loss [0.022448197, 0.021754365]
critic loss [0.022416007, 0.021771731]
critic loss [0.022203688, 0.021625636]
critic loss [0.021826774, 0.021325693]
actor loss -140.382
train end 0.5144336223602295
should be 1.0, predicted: [ 0.975300

critic loss [0.029386921, 0.027824029]
critic loss [0.029106645, 0.027499307]
critic loss [0.028718602, 0.027106848]
critic loss [0.028245544, 0.026671542]
actor loss -722.728
train end 0.9149160385131836
should be 1.0, predicted: [ 0.77392429], target predicted: [ 0.95721024]
should be -1.0, predicted: [-0.92081362], target predicted: [-0.89231533]
start searching new action
get new action:  0.2769334316253662
start getting new_q
get new_q:  0.0718686580657959
train start:
critic loss [0.027759757, 0.024993172]
critic loss [0.02710381, 0.024534203]
critic loss [0.026206771, 0.024003629]
critic loss [0.025190858, 0.023443449]
critic loss [0.024185689, 0.022887275]
actor loss -142.232
train end 0.7026968002319336
should be 1.0, predicted: [ 0.90222031], target predicted: [ 0.57985681]
should be -1.0, predicted: [-0.80749613], target predicted: [-0.86830491]
start searching new action
get new action:  0.22823023796081543
start getting new_q
get new_q:  0.06420063972473145
train start:
cr

get new action:  0.24651885032653809
start getting new_q
get new_q:  0.06806230545043945
train start:
critic loss [0.022210699, 0.021827783]
critic loss [0.02204873, 0.021674078]
critic loss [0.021786448, 0.021373775]
critic loss [0.021424156, 0.02095297]
critic loss [0.020960117, 0.020438319]
actor loss -316.067
train end 0.6054754257202148
should be 1.0, predicted: [ 0.87352026], target predicted: [ 0.86333644]
should be 1.0, predicted: [ 0.92249727], target predicted: [ 0.96980196]
should be 1.0, predicted: [ 0.9303183], target predicted: [ 1.10718656]
should be 1.0, predicted: [ 0.81820583], target predicted: [ 0.95480382]
start searching new action
get new action:  0.25784730911254883
start getting new_q
get new_q:  0.06351542472839355
train start:
critic loss [0.050719153, 0.019760357]
critic loss [0.046003513, 0.019149292]
critic loss [0.038299657, 0.018229041]
critic loss [0.030012606, 0.017249748]
critic loss [0.023136001, 0.016419299]
actor loss -64.6501
train end 0.590725898

Episode : 800 Replay Buffer 50000
TOTAL REWARD @ 800-th Episode  : Reward 1
Total Step: 108
start searching new action
get new action:  0.25294017791748047
start getting new_q
get new_q:  0.06431746482849121
train start:
critic loss [0.032686755, 0.029421084]
critic loss [0.032305092, 0.029110931]
critic loss [0.031107862, 0.028369151]
critic loss [0.029422559, 0.027373824]
critic loss [0.027593052, 0.02627651]
actor loss -837.87
train end 0.5441489219665527
should be -1.0, predicted: [-0.99903071], target predicted: [-1.00027919]
should be 1.0, predicted: [ 1.00428104], target predicted: [ 1.01253319]
should be 1.0, predicted: [ 0.96545136], target predicted: [ 0.99275041]
start searching new action
get new action:  0.26372742652893066
start getting new_q
get new_q:  0.06863284111022949
train start:
critic loss [0.03051278, 0.029134948]
critic loss [0.029590774, 0.028329059]
critic loss [0.028529974, 0.027574262]
critic loss [0.027441183, 0.026858501]
critic loss [0.026485419, 0.02618

critic loss [0.028321411, 0.017929174]
actor loss 1.53089
train end 0.5107505321502686
should be 1.0, predicted: [ 0.71201789], target predicted: [ 0.49723437]
should be 1.0, predicted: [ 1.29069924], target predicted: [ 1.09770167]
should be -1.0, predicted: [-0.79996431], target predicted: [-0.65422112]
Episode : 802 Replay Buffer 50000
TOTAL REWARD @ 802-th Episode  : Reward -1
Total Step: 123
start searching new action
get new action:  0.2434983253479004
start getting new_q
get new_q:  0.07052230834960938
train start:
critic loss [0.029258082, 0.019685762]
critic loss [0.028488452, 0.019901546]
critic loss [0.026870552, 0.019980848]
critic loss [0.024956506, 0.019930359]
critic loss [0.023250634, 0.019775603]
actor loss -398.808
train end 0.6301891803741455
should be 1.0, predicted: [ 0.54475844], target predicted: [ 0.85923105]
should be 1.0, predicted: [ 0.55472624], target predicted: [-0.23567733]
should be -1.0, predicted: [-1.09452939], target predicted: [-1.00520492]
start se

critic loss [0.032627165, 0.024050809]
critic loss [0.029889988, 0.023022387]
critic loss [0.026832726, 0.021861203]
critic loss [0.023881529, 0.020716287]
actor loss -205.138
train end 0.8062021732330322
should be 1.0, predicted: [ 0.73906296], target predicted: [ 1.00043631]
should be 1.0, predicted: [ 0.70116562], target predicted: [ 0.84896016]
Episode : 804 Replay Buffer 50000
TOTAL REWARD @ 804-th Episode  : Reward 1
Total Step: 141
start searching new action
get new action:  0.24155521392822266
start getting new_q
get new_q:  0.06942033767700195
train start:
critic loss [0.027469415, 0.016946107]
critic loss [0.023467362, 0.016356934]
critic loss [0.019616913, 0.015900176]
critic loss [0.017309871, 0.01570892]
critic loss [0.016340004, 0.015718404]
actor loss -312.794
train end 0.6778173446655273
should be 1.0, predicted: [ 1.0982672], target predicted: [ 0.97605288]
should be 1.0, predicted: [ 0.93487144], target predicted: [ 0.68627948]
start searching new action
get new actio

critic loss [0.036740649, 0.036172319]
actor loss -532.193
train end 0.5369715690612793
should be 1.0, predicted: [ 0.81880277], target predicted: [ 0.82669938]
start searching new action
get new action:  0.23847746849060059
start getting new_q
get new_q:  0.06350016593933105
train start:
critic loss [0.021325549, 0.020835433]
critic loss [0.021491308, 0.02122812]
critic loss [0.021583587, 0.02148081]
critic loss [0.021587018, 0.021568218]
critic loss [0.021475861, 0.02147517]
actor loss 61.6486
train end 0.5573532581329346
should be -1.0, predicted: [-0.86723453], target predicted: [-1.0355233]
Episode : 806 Replay Buffer 50000
TOTAL REWARD @ 806-th Episode  : Reward 1
Total Step: 124
start searching new action
get new action:  0.3256504535675049
start getting new_q
get new_q:  0.08932852745056152
train start:
critic loss [0.037342183, 0.02926429]
critic loss [0.036270067, 0.028588783]
critic loss [0.033625696, 0.027293101]
critic loss [0.030134842, 0.025643006]
critic loss [0.0266065

critic loss [0.027106058, 0.026907291]
critic loss [0.026854828, 0.026768081]
critic loss [0.026596673, 0.026561003]
critic loss [0.026313331, 0.026280444]
actor loss -751.626
train end 0.5235428810119629
should be -1.0, predicted: [-0.90897202], target predicted: [-1.11353564]
should be 1.0, predicted: [ 1.09666455], target predicted: [ 1.14711678]
start searching new action
get new action:  0.22342395782470703
start getting new_q
get new_q:  0.07372665405273438
train start:
critic loss [0.030956253, 0.022659067]
critic loss [0.030406233, 0.022666981]
critic loss [0.029087812, 0.022443786]
critic loss [0.027290748, 0.022052538]
critic loss [0.025302254, 0.021560237]
actor loss 78.0033
train end 0.5370430946350098
should be 1.0, predicted: [ 0.77232397], target predicted: [ 0.59691882]
Episode : 808 Replay Buffer 50000
TOTAL REWARD @ 808-th Episode  : Reward 1
Total Step: 126
start searching new action
get new action:  0.23955988883972168
start getting new_q
get new_q:  0.0665590763092

get new action:  0.23331141471862793
start getting new_q
get new_q:  0.06803631782531738
train start:
critic loss [0.037202433, 0.02701072]
critic loss [0.036620282, 0.026967751]
critic loss [0.034606267, 0.026553214]
critic loss [0.031772237, 0.025888748]
critic loss [0.028803155, 0.025123434]
actor loss 105.31
train end 0.5427007675170898
should be -1.0, predicted: [-1.27230191], target predicted: [-1.0127238]
should be -1.0, predicted: [-1.22574735], target predicted: [-0.9096843]
start searching new action
get new action:  0.23011207580566406
start getting new_q
get new_q:  0.0602574348449707
train start:
critic loss [0.025640441, 0.02393863]
critic loss [0.024695911, 0.023520086]
critic loss [0.023847852, 0.023161609]
critic loss [0.023162581, 0.022855705]
critic loss [0.022669792, 0.022591211]
actor loss -563.642
train end 0.5215492248535156
should be 1.0, predicted: [ 1.36700916], target predicted: [ 1.04095149]
should be 1.0, predicted: [ 1.14098966], target predicted: [ 0.9526

Episode : 811 Replay Buffer 50000
TOTAL REWARD @ 811-th Episode  : Reward 1
Total Step: 110
start searching new action
get new action:  0.25246334075927734
start getting new_q
get new_q:  0.06749844551086426
train start:
critic loss [0.02613179, 0.026018437]
critic loss [0.025930047, 0.025891924]
critic loss [0.025454642, 0.025451105]
critic loss [0.024736142, 0.024732016]
critic loss [0.023830127, 0.023799859]
actor loss -161.543
train end 0.5631639957427979
should be 1.0, predicted: [ 0.88132215], target predicted: [ 0.9208234]
start searching new action
get new action:  0.25160694122314453
start getting new_q
get new_q:  0.07009053230285645
train start:
critic loss [0.021981591, 0.021829281]
critic loss [0.021997849, 0.021721937]
critic loss [0.022011738, 0.021611685]
critic loss [0.021979704, 0.021478277]
critic loss [0.021878742, 0.021312453]
actor loss -507.538
train end 0.7113003730773926
should be -1.0, predicted: [-0.90757227], target predicted: [-0.90081573]
start searching n

critic loss [0.029312965, 0.021666747]
actor loss 159.767
train end 0.590604305267334
should be 1.0, predicted: [ 0.54926664], target predicted: [ 0.4052836]
should be 1.0, predicted: [ 0.92207813], target predicted: [ 0.87835753]
should be 1.0, predicted: [ 1.07072079], target predicted: [ 1.0541482]
should be 1.0, predicted: [ 1.05847299], target predicted: [ 0.78704518]
should be 1.0, predicted: [ 1.17824984], target predicted: [ 0.76334137]
Episode : 813 Replay Buffer 50000
TOTAL REWARD @ 813-th Episode  : Reward -1
Total Step: 127
start searching new action
get new action:  0.2649519443511963
start getting new_q
get new_q:  0.07588529586791992
train start:
critic loss [0.043746896, 0.037393123]
critic loss [0.043677911, 0.037661884]
critic loss [0.043045659, 0.037609115]
critic loss [0.041979413, 0.037271485]
critic loss [0.040598717, 0.036689322]
actor loss -410.551
train end 0.6044988632202148
should be 1.0, predicted: [ 1.15818846], target predicted: [ 0.29711294]
should be -1.

critic loss [0.024117371, 0.024117371]
critic loss [0.024134256, 0.024134256]
critic loss [0.024009801, 0.024009801]
critic loss [0.023751386, 0.023751386]
actor loss -334.79
train end 0.6100223064422607
start searching new action
get new action:  0.24643397331237793
start getting new_q
get new_q:  0.06435012817382812
train start:
critic loss [0.020978607, 0.017641073]
critic loss [0.020343067, 0.017219702]
critic loss [0.019408608, 0.016597673]
critic loss [0.018301971, 0.01585798]
critic loss [0.017152537, 0.015085047]
actor loss 74.2161
train end 0.5747151374816895
should be 1.0, predicted: [ 1.24572384], target predicted: [ 1.00311697]
should be 1.0, predicted: [ 0.95012969], target predicted: [ 0.86036241]
should be -1.0, predicted: [-0.94721013], target predicted: [-0.90662205]
Episode : 815 Replay Buffer 50000
TOTAL REWARD @ 815-th Episode  : Reward -1
Total Step: 151
start searching new action
get new action:  0.266329288482666
start getting new_q
get new_q:  0.0693993568420410

critic loss [0.027024552, 0.027024552]
actor loss -390.416
train end 0.5858938694000244
start searching new action
get new action:  0.26221752166748047
start getting new_q
get new_q:  0.0798485279083252
train start:
critic loss [0.044046421, 0.028713245]
critic loss [0.041687422, 0.027871303]
critic loss [0.037624069, 0.026610138]
critic loss [0.033177264, 0.025198821]
critic loss [0.029352354, 0.02387088]
actor loss -751.508
train end 0.6616687774658203
should be -1.0, predicted: [-1.22628284], target predicted: [-1.10086226]
should be -1.0, predicted: [-0.96019548], target predicted: [-0.55601656]
Episode : 817 Replay Buffer 50000
TOTAL REWARD @ 817-th Episode  : Reward 1
Total Step: 142
start searching new action
get new action:  0.2563633918762207
start getting new_q
get new_q:  0.07227945327758789
train start:
critic loss [0.03430713, 0.028510787]
critic loss [0.033994205, 0.028492175]
critic loss [0.033224583, 0.028396674]
critic loss [0.032084581, 0.028178535]
critic loss [0.030

critic loss [0.025033709, 0.025007769]
actor loss -317.274
train end 0.5060827732086182
should be -1.0, predicted: [-1.05880439], target predicted: [-0.99791169]
start searching new action
get new action:  0.2301785945892334
start getting new_q
get new_q:  0.07318830490112305
train start:
critic loss [0.019847121, 0.018545439]
critic loss [0.019600838, 0.018225156]
critic loss [0.019203229, 0.0178397]
critic loss [0.018702231, 0.017426807]
critic loss [0.018137958, 0.017009348]
actor loss -194.658
train end 0.5530903339385986
should be 1.0, predicted: [ 0.87485015], target predicted: [ 0.94046599]
should be -1.0, predicted: [-1.04721189], target predicted: [-0.86678702]
start searching new action
get new action:  0.2457444667816162
start getting new_q
get new_q:  0.06757903099060059
train start:
critic loss [0.024674945, 0.022159612]
critic loss [0.024309747, 0.02226104]
critic loss [0.023671266, 0.022267137]
critic loss [0.022939272, 0.022168994]
critic loss [0.022284804, 0.021969086]

start searching new action
get new action:  0.27536869049072266
start getting new_q
get new_q:  0.07179093360900879
train start:
critic loss [0.026408961, 0.024604885]
critic loss [0.026857793, 0.024559135]
critic loss [0.027075004, 0.024462394]
critic loss [0.026971972, 0.024265671]
critic loss [0.026540948, 0.023952613]
actor loss -45.0956
train end 0.8048477172851562
should be 1.0, predicted: [ 0.86095899], target predicted: [ 0.94525898]
should be 1.0, predicted: [ 0.84839261], target predicted: [ 0.95943338]
start searching new action
get new action:  0.2275528907775879
start getting new_q
get new_q:  0.06610774993896484
train start:
critic loss [0.0300634, 0.017261837]
critic loss [0.0282973, 0.016962336]
critic loss [0.025665209, 0.016596397]
critic loss [0.022760101, 0.016228925]
critic loss [0.02019657, 0.01592765]
actor loss -265.27
train end 0.6466560363769531
should be 1.0, predicted: [ 0.96538144], target predicted: [ 0.99624491]
should be 1.0, predicted: [ 0.85796195], ta

critic loss [0.023251493, 0.015723575]
critic loss [0.023356652, 0.015661966]
critic loss [0.023012614, 0.015539794]
critic loss [0.022282824, 0.015365846]
critic loss [0.021269444, 0.015154276]
actor loss -306.046
train end 0.7859458923339844
should be 1.0, predicted: [ 0.52059746], target predicted: [ 0.75866002]
should be -1.0, predicted: [-0.90174615], target predicted: [-0.81401622]
should be 1.0, predicted: [ 0.93480039], target predicted: [ 1.06553209]
start searching new action
get new action:  0.2620890140533447
start getting new_q
get new_q:  0.07270598411560059
train start:
critic loss [0.035244163, 0.02467268]
critic loss [0.033980481, 0.024260961]
critic loss [0.031907223, 0.023676405]
critic loss [0.029431533, 0.023002943]
critic loss [0.027201176, 0.02234304]
actor loss -349.702
train end 0.6675784587860107
should be -1.0, predicted: [-1.10806227], target predicted: [-1.07463777]
should be -1.0, predicted: [-1.0636785], target predicted: [-1.06184292]
should be -1.0, pre

critic loss [0.071352527, 0.030681012]
critic loss [0.066259935, 0.029603234]
critic loss [0.059066765, 0.028342457]
critic loss [0.050502367, 0.027080309]
actor loss -383.933
train end 0.6143357753753662
should be 1.0, predicted: [ 0.40168574], target predicted: [ 0.04440781]
should be 1.0, predicted: [ 0.81732053], target predicted: [ 0.88618755]
start searching new action
get new action:  0.24687886238098145
start getting new_q
get new_q:  0.0701150894165039
train start:
critic loss [0.093224123, 0.028064894]
critic loss [0.089690663, 0.028223917]
critic loss [0.084197223, 0.028258407]
critic loss [0.077350572, 0.028158952]
critic loss [0.069636658, 0.027916249]
actor loss -81.9165
train end 0.5609762668609619
should be 1.0, predicted: [ 1.01786149], target predicted: [ 0.98297292]
should be 1.0, predicted: [ 0.9918924], target predicted: [ 0.98705775]
should be 1.0, predicted: [ 1.20309055], target predicted: [ 0.94476271]
should be -1.0, predicted: [-0.5935604], target predicted: 

Episode : 826 Replay Buffer 50000
TOTAL REWARD @ 826-th Episode  : Reward -1
Total Step: 124
start searching new action
get new action:  0.24599170684814453
start getting new_q
get new_q:  0.0642235279083252
train start:
critic loss [0.023065422, 0.023065422]
critic loss [0.022684056, 0.022684056]
critic loss [0.022199564, 0.022199564]
critic loss [0.021646515, 0.021646515]
critic loss [0.021078724, 0.021078724]
actor loss -572.333
train end 0.5422918796539307
start searching new action
get new action:  0.24001002311706543
start getting new_q
get new_q:  0.0682382583618164
train start:
critic loss [0.019360382, 0.013713203]
critic loss [0.018024469, 0.013007948]
critic loss [0.016156778, 0.012225669]
critic loss [0.014161658, 0.011479467]
critic loss [0.012417388, 0.010865377]
actor loss -334.891
train end 0.5231668949127197
should be 1.0, predicted: [ 0.2656697], target predicted: [ 0.43375525]
should be -1.0, predicted: [-1.02003741], target predicted: [-1.12348962]
start searching n

get new action:  0.23657917976379395
start getting new_q
get new_q:  0.07433247566223145
train start:
critic loss [0.029117897, 0.027361225]
critic loss [0.028736813, 0.02712838]
critic loss [0.028111666, 0.026770281]
critic loss [0.027320988, 0.026312547]
critic loss [0.026450135, 0.025783036]
actor loss -387.732
train end 0.5107007026672363
should be -1.0, predicted: [-1.12135541], target predicted: [-1.00028646]
should be 1.0, predicted: [ 1.04892433], target predicted: [ 0.8790372]
Episode : 828 Replay Buffer 50000
TOTAL REWARD @ 828-th Episode  : Reward -1
Total Step: 97
start searching new action
get new action:  0.24282193183898926
start getting new_q
get new_q:  0.06660270690917969
train start:
critic loss [0.030138697, 0.024336807]
critic loss [0.030046459, 0.024132624]
critic loss [0.029673651, 0.023825936]
critic loss [0.029055916, 0.023431886]
critic loss [0.028241741, 0.02297245]
actor loss 34.9094
train end 0.5447173118591309
should be 1.0, predicted: [ 1.05344903], targe

critic loss [0.034354862, 0.023595858]
critic loss [0.032606266, 0.023341347]
critic loss [0.030099832, 0.02301438]
critic loss [0.027424417, 0.022672318]
actor loss -164.556
train end 0.7308375835418701
should be 1.0, predicted: [ 1.08974397], target predicted: [ 1.14714909]
should be 1.0, predicted: [ 0.83249539], target predicted: [ 0.72698885]
start searching new action
get new action:  0.22921395301818848
start getting new_q
get new_q:  0.06435060501098633
train start:
critic loss [0.018960763, 0.018960763]
critic loss [0.018842885, 0.018842885]
critic loss [0.018689081, 0.018689081]
critic loss [0.018508079, 0.018508079]
critic loss [0.018304387, 0.018304387]
actor loss -273.464
train end 0.5968837738037109
start searching new action
get new action:  0.22479867935180664
start getting new_q
get new_q:  0.06382608413696289
train start:
critic loss [0.084558062, 0.024505384]
critic loss [0.080969885, 0.024095373]
critic loss [0.074371226, 0.023330059]
critic loss [0.065290742, 0.022

critic loss [0.019295098, 0.018638371]
actor loss -329.463
train end 0.545950174331665
should be 1.0, predicted: [ 0.84795481], target predicted: [ 1.04094875]
start searching new action
get new action:  0.25893568992614746
start getting new_q
get new_q:  0.06938934326171875
train start:
critic loss [0.014465779, 0.013650453]
critic loss [0.014193247, 0.013510426]
critic loss [0.0137414, 0.013279568]
critic loss [0.013234266, 0.013001695]
critic loss [0.012783684, 0.012717664]
actor loss -167.562
train end 0.674274206161499
should be 1.0, predicted: [ 1.17661059], target predicted: [ 0.84370118]
start searching new action
get new action:  0.24120855331420898
start getting new_q
get new_q:  0.06972789764404297
train start:
critic loss [0.042766105, 0.028270015]
critic loss [0.041602522, 0.028007753]
critic loss [0.039232388, 0.027583819]
critic loss [0.036205541, 0.027072262]
critic loss [0.033114534, 0.026558794]
actor loss -540.194
train end 0.5899128913879395
should be 1.0, predicted

critic loss [0.047112644, 0.029012663]
critic loss [0.044794463, 0.028416507]
critic loss [0.042075522, 0.027862053]
critic loss [0.039128754, 0.027334727]
actor loss -177.17
train end 0.6469779014587402
should be 1.0, predicted: [ 1.05528474], target predicted: [ 1.1378386]
should be -1.0, predicted: [-0.78614384], target predicted: [-0.904387]
should be 1.0, predicted: [ 0.72534865], target predicted: [ 0.02408502]
should be 1.0, predicted: [ 1.05479431], target predicted: [ 0.79511034]
start searching new action
get new action:  0.24553132057189941
start getting new_q
get new_q:  0.06827783584594727
train start:
critic loss [0.055261277, 0.034429841]
critic loss [0.052994024, 0.034396794]
critic loss [0.048711129, 0.033937119]
critic loss [0.043466173, 0.033163141]
critic loss [0.038598359, 0.032238409]
actor loss -25.2188
train end 0.6328976154327393
should be -1.0, predicted: [-0.38514942], target predicted: [-0.64096332]
should be 1.0, predicted: [ 0.94746768], target predicted: 

critic loss [0.034771759, 0.018950108]
actor loss 111.426
train end 0.6162247657775879
should be 1.0, predicted: [ 0.99493486], target predicted: [ 0.83013296]
should be 1.0, predicted: [ 1.10779035], target predicted: [ 0.92703229]
should be 1.0, predicted: [ 1.02451265], target predicted: [ 0.45050639]
Episode : 835 Replay Buffer 50000
TOTAL REWARD @ 835-th Episode  : Reward -1
Total Step: 149
start searching new action
get new action:  0.25877952575683594
start getting new_q
get new_q:  0.0783841609954834
train start:
critic loss [0.033448182, 0.032046512]
critic loss [0.035190463, 0.033318385]
critic loss [0.036475759, 0.034191504]
critic loss [0.037068326, 0.034619667]
critic loss [0.036888197, 0.034582246]
actor loss -283.433
train end 0.5760452747344971
should be 1.0, predicted: [ 1.19225085], target predicted: [ 1.03903615]
should be 1.0, predicted: [ 1.24528658], target predicted: [ 0.81778473]
should be 1.0, predicted: [ 1.10433173], target predicted: [ 0.94829756]
start sear

get new action:  0.23934125900268555
start getting new_q
get new_q:  0.07372188568115234
train start:
critic loss [0.022062168, 0.01997013]
critic loss [0.022261731, 0.020288911]
critic loss [0.022355223, 0.020547826]
critic loss [0.022323672, 0.020715]
critic loss [0.022184087, 0.020789627]
actor loss -349.074
train end 0.518707275390625
should be 1.0, predicted: [ 1.11293304], target predicted: [ 1.16503334]
should be 1.0, predicted: [ 0.89107782], target predicted: [ 0.7878319]
start searching new action
get new action:  0.2570638656616211
start getting new_q
get new_q:  0.06527042388916016
train start:
critic loss [0.11885004, 0.029291427]
critic loss [0.10856844, 0.028280979]
critic loss [0.093371943, 0.026868841]
critic loss [0.075862058, 0.025328264]
critic loss [0.059286244, 0.023991544]
actor loss -126.003
train end 0.5502831935882568
should be 1.0, predicted: [ 0.97378707], target predicted: [ 1.17164648]
should be -1.0, predicted: [-0.52952224], target predicted: [-0.5978492

critic loss [0.027024006, 0.022650134]
critic loss [0.026391264, 0.022342613]
critic loss [0.02531217, 0.021909621]
critic loss [0.023989245, 0.021398164]
actor loss -233.861
train end 0.5259900093078613
should be 1.0, predicted: [ 0.9363513], target predicted: [ 0.85659593]
should be 1.0, predicted: [ 1.15619099], target predicted: [ 1.08468318]
start searching new action
get new action:  0.23803400993347168
start getting new_q
get new_q:  0.07015490531921387
train start:
critic loss [0.037818193, 0.037735112]
critic loss [0.037165567, 0.037023008]
critic loss [0.036303543, 0.036086939]
critic loss [0.035283223, 0.034982722]
critic loss [0.034158062, 0.033771627]
actor loss -946.288
train end 0.49747133255004883
should be 1.0, predicted: [ 0.88943446], target predicted: [ 0.87341088]
start searching new action
get new action:  0.23917293548583984
start getting new_q
get new_q:  0.07018589973449707
train start:
critic loss [0.029843982, 0.025362059]
critic loss [0.028822612, 0.02503736

Episode : 840 Replay Buffer 50000
TOTAL REWARD @ 840-th Episode  : Reward 1
Total Step: 138
start searching new action
get new action:  0.2641263008117676
start getting new_q
get new_q:  0.07024407386779785
train start:
critic loss [0.0261153, 0.0261153]
critic loss [0.026514368, 0.026514368]
critic loss [0.026774351, 0.026774351]
critic loss [0.026852939, 0.026852939]
critic loss [0.026731007, 0.026731007]
actor loss -543.589
train end 0.5769834518432617
start searching new action
get new action:  0.22695708274841309
start getting new_q
get new_q:  0.07088875770568848
train start:
critic loss [0.03183179, 0.021621091]
critic loss [0.030449394, 0.021073166]
critic loss [0.027903747, 0.020174658]
critic loss [0.024798352, 0.01910213]
critic loss [0.021759028, 0.018037694]
actor loss -93.6365
train end 0.5369114875793457
should be -1.0, predicted: [-1.02670634], target predicted: [-0.95041287]
should be -1.0, predicted: [-0.86821949], target predicted: [-1.00021434]
should be -1.0, predi

Episode : 842 Replay Buffer 50000
TOTAL REWARD @ 842-th Episode  : Reward 1
Total Step: 115
start searching new action
get new action:  0.24334931373596191
start getting new_q
get new_q:  0.06530117988586426
train start:
critic loss [0.038078867, 0.017173372]
critic loss [0.037447255, 0.017287392]
critic loss [0.035688683, 0.017171413]
critic loss [0.033105195, 0.016854182]
critic loss [0.030056428, 0.016394194]
actor loss -437.094
train end 0.5492477416992188
should be -1.0, predicted: [-0.32875738], target predicted: [-0.50465173]
should be 1.0, predicted: [ 1.00573325], target predicted: [ 0.92796355]
start searching new action
get new action:  0.23932480812072754
start getting new_q
get new_q:  0.06494355201721191
train start:
critic loss [0.014377858, 0.014377858]
critic loss [0.014377156, 0.014377156]
critic loss [0.014357155, 0.014357155]
critic loss [0.014310794, 0.014310794]
critic loss [0.014227808, 0.014227808]
actor loss -228.903
train end 0.5356366634368896
start searching

Episode : 844 Replay Buffer 50000
TOTAL REWARD @ 844-th Episode  : Reward -1
Total Step: 103
start searching new action
get new action:  0.24196386337280273
start getting new_q
get new_q:  0.0639193058013916
train start:
critic loss [0.019548468, 0.017845264]
critic loss [0.020957429, 0.01884521]
critic loss [0.022085812, 0.01969656]
critic loss [0.022752468, 0.020242862]
critic loss [0.022890603, 0.020422472]
actor loss -408.144
train end 0.5099093914031982
should be -1.0, predicted: [-0.70480728], target predicted: [-0.96984267]
start searching new action
get new action:  0.24305319786071777
start getting new_q
get new_q:  0.07449078559875488
train start:
critic loss [0.03440528, 0.023735534]
critic loss [0.033260297, 0.023665249]
critic loss [0.031772655, 0.023553848]
critic loss [0.030183349, 0.023438554]
critic loss [0.028698675, 0.023341466]
actor loss -211.817
train end 0.6805343627929688
should be 1.0, predicted: [ 0.94188505], target predicted: [ 1.02369356]
should be 1.0, pre

critic loss [0.01689242, 0.016692642]
actor loss -443.275
train end 0.5549721717834473
should be 1.0, predicted: [ 0.91669154], target predicted: [ 1.05439985]
should be 1.0, predicted: [ 0.91525507], target predicted: [ 0.8758437]
should be -1.0, predicted: [-1.02419758], target predicted: [-1.03576314]
Episode : 846 Replay Buffer 50000
TOTAL REWARD @ 846-th Episode  : Reward 1
Total Step: 118
start searching new action
get new action:  0.32138633728027344
start getting new_q
get new_q:  0.10306668281555176
train start:
critic loss [0.019345116, 0.018514086]
critic loss [0.019213788, 0.018402699]
critic loss [0.018992037, 0.018233998]
critic loss [0.018699169, 0.018019501]
critic loss [0.018356096, 0.017770968]
actor loss -303.419
train end 1.111189603805542
should be 1.0, predicted: [ 0.86964285], target predicted: [ 0.8833071]
should be -1.0, predicted: [-0.96400297], target predicted: [-1.02068865]
start searching new action
get new action:  0.2778477668762207
start getting new_q
g

Episode : 848 Replay Buffer 50000
TOTAL REWARD @ 848-th Episode  : Reward 1
Total Step: 126
start searching new action
get new action:  0.23782587051391602
start getting new_q
get new_q:  0.07272219657897949
train start:
critic loss [0.031774022, 0.02697086]
critic loss [0.030990234, 0.026777606]
critic loss [0.030001933, 0.02647789]
critic loss [0.028892923, 0.026092159]
critic loss [0.027769577, 0.025651921]
actor loss -806.079
train end 0.5625078678131104
should be 1.0, predicted: [ 0.77519113], target predicted: [ 0.7019164]
start searching new action
get new action:  0.26761317253112793
start getting new_q
get new_q:  0.09846162796020508
train start:
critic loss [0.019876543, 0.014127145]
critic loss [0.018614519, 0.014049737]
critic loss [0.01689576, 0.013855661]
critic loss [0.015199848, 0.013591079]
critic loss [0.01389764, 0.013299789]
actor loss -244.766
train end 0.8063404560089111
should be 1.0, predicted: [ 1.14176095], target predicted: [ 0.96404541]
should be 1.0, predic

critic loss [0.013834765, 0.013159185]
actor loss -491.573
train end 0.5179421901702881
should be 1.0, predicted: [ 1.02861118], target predicted: [ 1.15076888]
should be 1.0, predicted: [ 1.00568402], target predicted: [ 1.16941667]
should be -1.0, predicted: [-0.790299], target predicted: [-0.88783038]
trained action prob map predicted by initial model for a starting game
[[[  1.65502518e-12   8.46487087e-16   8.46835906e-16   8.46751944e-16
     8.52810559e-16   8.46729339e-16   8.74998164e-16]
  [  8.48352360e-16   8.46832677e-16   8.46493546e-16   8.46496775e-16
     8.74124078e-16   8.46545215e-16   8.46826218e-16]
  [  8.46832677e-16   8.46506463e-16   8.47120245e-16   8.46496775e-16
     8.46525839e-16   8.73800724e-16   8.57344515e-16]
  [  8.46500005e-16   8.46512922e-16   8.46500005e-16   8.46522610e-16
     8.46548444e-16   8.46745485e-16   8.46661470e-16]
  [  8.49764523e-16   8.46487087e-16   8.46503234e-16   8.46696993e-16
     8.46632407e-16   8.46596884e-16   8.4807728

critic loss [0.032984793, 0.01989986]
critic loss [0.030745277, 0.019554798]
critic loss [0.028186724, 0.019167434]
critic loss [0.025625963, 0.01878221]
actor loss -487.073
train end 0.528611421585083
should be -1.0, predicted: [-0.5655387], target predicted: [-0.29131791]
start searching new action
get new action:  0.2406940460205078
start getting new_q
get new_q:  0.06048846244812012
train start:
critic loss [0.018650658, 0.018650658]
critic loss [0.018465685, 0.018465685]
critic loss [0.018180648, 0.018180648]
critic loss [0.017806495, 0.017806495]
critic loss [0.017366314, 0.017366314]
actor loss -27.4073
train end 0.5135211944580078
start searching new action
get new action:  0.2415151596069336
start getting new_q
get new_q:  0.0668034553527832
train start:
critic loss [0.020773795, 0.020773795]
critic loss [0.020790733, 0.020790733]
critic loss [0.02073855, 0.02073855]
critic loss [0.020617742, 0.020617742]
critic loss [0.02043223, 0.02043223]
actor loss -1189.37
train end 0.549

get new action:  0.24327802658081055
start getting new_q
get new_q:  0.06084799766540527
train start:
critic loss [0.023020774, 0.02242668]
critic loss [0.022690166, 0.022215158]
critic loss [0.022214636, 0.021869551]
critic loss [0.02165499, 0.021433312]
critic loss [0.021061704, 0.02094144]
actor loss -688.829
train end 0.5386354923248291
should be -1.0, predicted: [-1.04069388], target predicted: [-0.93021601]
should be 1.0, predicted: [ 1.13836777], target predicted: [ 1.08152199]
start searching new action
get new action:  0.23355317115783691
start getting new_q
get new_q:  0.06715726852416992
train start:
critic loss [0.018962024, 0.015347507]
critic loss [0.017904902, 0.014830023]
critic loss [0.016727086, 0.014269652]
critic loss [0.015558663, 0.013718556]
critic loss [0.014503418, 0.01321901]
actor loss -357.563
train end 0.5188510417938232
should be 1.0, predicted: [ 1.1558857], target predicted: [ 1.13395095]
should be -1.0, predicted: [-1.07999766], target predicted: [-1.04

get new action:  0.22641253471374512
start getting new_q
get new_q:  0.06523776054382324
train start:
critic loss [0.01712418, 0.016251266]
critic loss [0.017199997, 0.016509037]
critic loss [0.016950231, 0.016505472]
critic loss [0.016480289, 0.016264187]
critic loss [0.015891626, 0.015826544]
actor loss -362.111
train end 0.6008610725402832
should be 1.0, predicted: [ 0.92725003], target predicted: [ 0.94929701]
start searching new action
get new action:  0.2213759422302246
start getting new_q
get new_q:  0.06279683113098145
train start:
critic loss [0.10753202, 0.031388581]
critic loss [0.10223462, 0.030680642]
critic loss [0.093129739, 0.029607233]
critic loss [0.081743926, 0.028373737]
critic loss [0.069925055, 0.027209844]
actor loss -61.5766
train end 0.5642573833465576
should be 1.0, predicted: [ 1.03543878], target predicted: [ 0.9990831]
should be 1.0, predicted: [ 0.13736981], target predicted: [ 0.13751899]
should be 1.0, predicted: [ 0.8079794], target predicted: [ 0.83491

critic loss [0.031244496, 0.030005708]
actor loss -351.84
train end 0.5268633365631104
should be 1.0, predicted: [ 0.99928248], target predicted: [ 0.71931946]
should be -1.0, predicted: [-1.01555836], target predicted: [-1.01230454]
start searching new action
get new action:  0.2257990837097168
start getting new_q
get new_q:  0.06335806846618652
train start:
critic loss [0.042251479, 0.035050713]
critic loss [0.040141612, 0.033799395]
critic loss [0.037223719, 0.032231979]
critic loss [0.033954032, 0.0305018]
critic loss [0.030766545, 0.028752491]
actor loss -687.639
train end 0.5492336750030518
should be 1.0, predicted: [ 1.22678936], target predicted: [ 1.0251385]
should be 1.0, predicted: [ 1.18228281], target predicted: [ 1.05883813]
start searching new action
get new action:  0.23331785202026367
start getting new_q
get new_q:  0.06738448143005371
train start:
critic loss [0.052121826, 0.026843578]
critic loss [0.046352196, 0.024785474]
critic loss [0.039689474, 0.022698507]
criti

critic loss [0.025103273, 0.021460846]
critic loss [0.024585471, 0.021239497]
critic loss [0.023850501, 0.020901257]
critic loss [0.022948075, 0.020473355]
actor loss -478.605
train end 0.5241889953613281
should be -1.0, predicted: [-0.93188232], target predicted: [-1.05597806]
should be 1.0, predicted: [ 0.86749846], target predicted: [ 0.94349074]
should be 1.0, predicted: [ 0.87518567], target predicted: [ 0.52621031]
start searching new action
get new action:  0.24048471450805664
start getting new_q
get new_q:  0.06192183494567871
train start:
critic loss [0.031870313, 0.031807289]
critic loss [0.031286716, 0.031265512]
critic loss [0.03056889, 0.030568298]
critic loss [0.029785383, 0.029774118]
critic loss [0.028991485, 0.028934333]
actor loss -846.387
train end 0.5066213607788086
should be -1.0, predicted: [-1.10838759], target predicted: [-1.12218833]
start searching new action
get new action:  0.23239374160766602
start getting new_q
get new_q:  0.06548929214477539
train start:


critic loss [0.018897049, 0.018571887]
actor loss -602.546
train end 0.5420560836791992
should be 1.0, predicted: [ 0.80581439], target predicted: [ 0.91030949]
start searching new action
get new action:  0.2547476291656494
start getting new_q
get new_q:  0.07404851913452148
train start:
critic loss [0.022501778, 0.018104928]
critic loss [0.021878801, 0.018228985]
critic loss [0.021112381, 0.018284557]
critic loss [0.020315412, 0.018265653]
critic loss [0.019554578, 0.018165883]
actor loss -382.964
train end 0.609717845916748
should be 1.0, predicted: [ 0.96936053], target predicted: [ 1.1001159]
should be 1.0, predicted: [ 0.78448522], target predicted: [ 1.02903104]
should be 1.0, predicted: [ 1.03723025], target predicted: [ 0.60292393]
should be 1.0, predicted: [ 0.89828539], target predicted: [ 0.76354015]
should be -1.0, predicted: [-0.94693744], target predicted: [-0.82677299]
should be -1.0, predicted: [-0.91798574], target predicted: [-0.80776644]
start searching new action
ge

get new action:  0.25580334663391113
start getting new_q
get new_q:  0.07831859588623047
train start:
critic loss [0.043459434, 0.034641743]
critic loss [0.044400919, 0.034129567]
critic loss [0.044654608, 0.03356494]
critic loss [0.044142939, 0.032919593]
critic loss [0.042920791, 0.032189861]
actor loss -501.965
train end 0.62772536277771
should be -1.0, predicted: [-0.41182879], target predicted: [-0.43587497]
start searching new action
get new action:  0.24845004081726074
start getting new_q
get new_q:  0.08092284202575684
train start:
critic loss [0.027059935, 0.027059935]
critic loss [0.027078651, 0.027078651]
critic loss [0.02690178, 0.02690178]
critic loss [0.026553696, 0.026553696]
critic loss [0.026054427, 0.026054427]
actor loss -595.253
train end 0.8238911628723145
start searching new action
get new action:  0.2894721031188965
start getting new_q
get new_q:  0.09602737426757812
train start:
critic loss [0.036215965, 0.031383008]
critic loss [0.035136968, 0.030509125]
critic



Episode : 864 Replay Buffer 50000
TOTAL REWARD @ 864-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2377488613128662
start getting new_q
get new_q:  0.06382060050964355
train start:
critic loss [0.028947635, 0.02326344]
critic loss [0.027733563, 0.02235906]
critic loss [0.026132751, 0.021349834]
critic loss [0.024353364, 0.020326234]
critic loss [0.022582589, 0.019360531]
actor loss nan
train end 0.5738651752471924
should be 1.0, predicted: [ 0.69497603], target predicted: [ 1.02765965]
should be 1.0, predicted: [ 0.92376792], target predicted: [ 1.12600529]
start searching new action
get new action:  0.2425854206085205
start getting new_q
get new_q:  0.07212018966674805
train start:
critic loss [0.16046114, 0.034576207]
critic loss [0.15321872, 0.034267478]
critic loss [0.14199428, 0.033590388]
critic loss [0.12854326, 0.032804098]
critic loss [0.11450362, 0.03214024]
actor loss nan
train end 0.685915470123291
should be 1.0, predicted: [-0.5491992

Episode : 866 Replay Buffer 50000
TOTAL REWARD @ 866-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2792675495147705
start getting new_q
get new_q:  0.07401776313781738
train start:
critic loss [0.11335593, 0.046394959]
critic loss [0.11235069, 0.045839466]
critic loss [0.10751745, 0.044944376]
critic loss [0.099674642, 0.0437566]
critic loss [0.089897729, 0.04240904]
actor loss nan
train end 0.6243081092834473
should be 1.0, predicted: [-0.14916442], target predicted: [-0.21906392]
should be 1.0, predicted: [ 0.67812085], target predicted: [ 0.62125665]
should be -1.0, predicted: [-1.35115683], target predicted: [-1.02928627]
start searching new action
get new action:  0.24367213249206543
start getting new_q
get new_q:  0.0723416805267334
train start:
critic loss [0.015275488, 0.01311525]
critic loss [0.014814533, 0.013078514]
critic loss [0.014471583, 0.013094986]
critic loss [0.014238756, 0.013143856]
critic loss [0.014094219, 0.013201471]
actor

get new action:  0.22614288330078125
start getting new_q
get new_q:  0.06101179122924805
train start:
critic loss [0.021700464, 0.021470372]
critic loss [0.021470264, 0.021307033]
critic loss [0.021248288, 0.021140993]
critic loss [0.021030476, 0.020966142]
critic loss [0.020803448, 0.020769432]
actor loss nan
train end 0.5259730815887451
should be -1.0, predicted: [-0.99545795], target predicted: [-1.01133704]
start searching new action
get new action:  0.2641265392303467
start getting new_q
get new_q:  0.11032581329345703
train start:
critic loss [0.026208531, 0.022907495]
critic loss [0.025887722, 0.022858534]
critic loss [0.025051909, 0.022546306]
critic loss [0.023861391, 0.022012491]
critic loss [0.022502933, 0.021315088]
actor loss nan
train end 0.7903356552124023
should be 1.0, predicted: [ 0.1786426], target predicted: [ 0.15363014]
Episode : 868 Replay Buffer 50000
TOTAL REWARD @ 868-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.24625802

critic loss [0.031329945, 0.028342847]
critic loss [0.029649271, 0.027465764]
critic loss [0.02818343, 0.026494876]
critic loss [0.027056405, 0.025564848]
actor loss nan
train end 0.5205545425415039
should be 1.0, predicted: [ 0.94366962], target predicted: [ 0.77990472]
should be 1.0, predicted: [ 1.14142907], target predicted: [ 1.06293106]
should be 1.0, predicted: [ 0.94021547], target predicted: [ 0.88409632]
should be -1.0, predicted: [-1.00591087], target predicted: [-1.12580884]
start searching new action
get new action:  0.24341368675231934
start getting new_q
get new_q:  0.06453657150268555
train start:
critic loss [0.028184567, 0.027666727]
critic loss [0.028647209, 0.028166173]
critic loss [0.028971851, 0.028530121]
critic loss [0.029109314, 0.0287067]
critic loss [0.029033169, 0.028667606]
actor loss nan
train end 0.5199887752532959
should be -1.0, predicted: [-0.84492648], target predicted: [-0.97470796]
start searching new action
get new action:  0.23087430000305176
star

get new action:  0.24635624885559082
start getting new_q
get new_q:  0.07265186309814453
train start:
critic loss [0.020825926, 0.020825926]
critic loss [0.020715987, 0.020715987]
critic loss [0.020596266, 0.020596266]
critic loss [0.020460457, 0.020460457]
critic loss [0.020302761, 0.020302761]
actor loss nan
train end 0.7291915416717529
start searching new action
get new action:  0.23749923706054688
start getting new_q
get new_q:  0.061653852462768555
train start:
critic loss [0.025860395, 0.025743157]
critic loss [0.025667993, 0.025523284]
critic loss [0.025354337, 0.025179677]
critic loss [0.024942491, 0.024735754]
critic loss [0.024455784, 0.02421537]
actor loss nan
train end 0.611302375793457
should be 1.0, predicted: [ 0.89043653], target predicted: [ 0.92502505]
should be -1.0, predicted: [-0.80243164], target predicted: [-0.87504387]
start searching new action
get new action:  0.23482918739318848
start getting new_q
get new_q:  0.0703427791595459
train start:
critic loss [0.02

get new action:  0.23894929885864258
start getting new_q
get new_q:  0.06476545333862305
train start:
critic loss [0.071363933, 0.026927352]
critic loss [0.065088235, 0.026012177]
critic loss [0.05423905, 0.024547901]
critic loss [0.042077973, 0.022894708]
critic loss [0.031052735, 0.021344565]
actor loss nan
train end 0.5148582458496094
should be 1.0, predicted: [ 0.87009144], target predicted: [ 0.85264111]
should be 1.0, predicted: [ 1.0165441], target predicted: [ 0.94403458]
should be -1.0, predicted: [-0.798994], target predicted: [-0.74604833]
should be 1.0, predicted: [ 0.85605544], target predicted: [ 0.42229298]
start searching new action
get new action:  0.2336876392364502
start getting new_q
get new_q:  0.07146525382995605
train start:
critic loss [0.027060248, 0.025124354]
critic loss [0.026349099, 0.024928635]
critic loss [0.02574588, 0.024747629]
critic loss [0.025191925, 0.024549719]
critic loss [0.024686847, 0.024321264]
actor loss nan
train end 0.63039231300354
should

get new action:  0.2286510467529297
start getting new_q
get new_q:  0.05978274345397949
train start:
critic loss [0.11439567, 0.02271178]
critic loss [0.1106894, 0.022590728]
critic loss [0.10415974, 0.022263033]
critic loss [0.095503122, 0.021771859]
critic loss [0.085573882, 0.021179913]
actor loss nan
train end 0.5365304946899414
should be 1.0, predicted: [ 1.09757221], target predicted: [ 1.0027405]
should be 1.0, predicted: [ 1.18534338], target predicted: [ 0.90988159]
should be 1.0, predicted: [ 0.20861474], target predicted: [-0.55554253]
should be -1.0, predicted: [-0.02988282], target predicted: [-0.6228978]
start searching new action
get new action:  0.23419761657714844
start getting new_q
get new_q:  0.06899476051330566
train start:
critic loss [0.035027787, 0.032645196]
critic loss [0.034944363, 0.033091906]
critic loss [0.034501929, 0.033148896]
critic loss [0.033789758, 0.032868087]
critic loss [0.0328971, 0.032317564]
actor loss nan
train end 0.5344202518463135
should b

start searching new action
get new action:  0.2401423454284668
start getting new_q
get new_q:  0.06963205337524414
train start:
critic loss [0.077679604, 0.039467677]
critic loss [0.074033327, 0.039332755]
critic loss [0.06807524, 0.038768422]
critic loss [0.060896851, 0.037899572]
critic loss [0.053835344, 0.036943845]
actor loss nan
train end 0.5688574314117432
should be -1.0, predicted: [-0.40214285], target predicted: [-0.62123024]
should be 1.0, predicted: [ 0.87495369], target predicted: [ 0.11135964]
should be 1.0, predicted: [ 0.92234862], target predicted: [ 0.4568215]
start searching new action
get new action:  0.2361292839050293
start getting new_q
get new_q:  0.06338644027709961
train start:
critic loss [0.032096907, 0.027508046]
critic loss [0.032897882, 0.028289752]
critic loss [0.033230808, 0.028800638]
critic loss [0.033059645, 0.028985823]
critic loss [0.03241805, 0.028836608]
actor loss nan
train end 0.5271527767181396
should be 1.0, predicted: [ 0.87691879], target p

critic loss [0.020358449, 0.018103406]
actor loss nan
train end 0.5685861110687256
should be -1.0, predicted: [-1.11864173], target predicted: [-1.05505204]
should be -1.0, predicted: [-1.09420764], target predicted: [-1.11569107]
should be 1.0, predicted: [ 1.23306084], target predicted: [ 1.35431731]
start searching new action
get new action:  0.2285168170928955
start getting new_q
get new_q:  0.06531739234924316
train start:
critic loss [0.024971906, 0.023007493]
critic loss [0.024625232, 0.02267928]
critic loss [0.024040863, 0.022235498]
critic loss [0.023280688, 0.021714568]
critic loss [0.022425829, 0.021157233]
actor loss nan
train end 0.5043425559997559
should be 1.0, predicted: [ 1.05038452], target predicted: [ 1.00742722]
should be 1.0, predicted: [ 0.99988186], target predicted: [ 1.02254236]
should be 1.0, predicted: [ 1.0006038], target predicted: [ 0.91510934]
start searching new action
get new action:  0.23316311836242676
start getting new_q
get new_q:  0.06930780410766

Episode : 881 Replay Buffer 50000
TOTAL REWARD @ 881-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.23670029640197754
start getting new_q
get new_q:  0.07271075248718262
train start:
critic loss [0.075117588, 0.02826217]
critic loss [0.067340605, 0.027907863]
critic loss [0.058450237, 0.02762292]
critic loss [0.050467741, 0.027514301]
critic loss [0.044250377, 0.027543992]
actor loss nan
train end 0.5484251976013184
should be 1.0, predicted: [ 0.85297787], target predicted: [ 0.38970667]
should be 1.0, predicted: [ 1.03606045], target predicted: [ 0.93235713]
should be -1.0, predicted: [-0.39775476], target predicted: [-0.34221092]
start searching new action
get new action:  0.2366771697998047
start getting new_q
get new_q:  0.06335854530334473
train start:
critic loss [0.024734376, 0.024734376]
critic loss [0.025705922, 0.025705922]
critic loss [0.026197061, 0.026197061]
critic loss [0.026244191, 0.026244191]
critic loss [0.025919681, 0.025919681]

get new action:  0.254727840423584
start getting new_q
get new_q:  0.06733512878417969
train start:
critic loss [0.057824753, 0.026524441]
critic loss [0.052983962, 0.025928201]
critic loss [0.045285486, 0.025115781]
critic loss [0.036928385, 0.024289303]
critic loss [0.029890193, 0.023611648]
actor loss nan
train end 0.9053456783294678
should be 1.0, predicted: [ 0.58224458], target predicted: [ 0.01620756]
start searching new action
get new action:  0.24314570426940918
start getting new_q
get new_q:  0.07332158088684082
train start:
critic loss [0.046485066, 0.046485066]
critic loss [0.046593584, 0.046593584]
critic loss [0.046433784, 0.046433784]
critic loss [0.046041019, 0.046041019]
critic loss [0.045454383, 0.045454383]
actor loss nan
train end 0.6943387985229492
start searching new action
get new action:  0.22878479957580566
start getting new_q
get new_q:  0.07258892059326172
train start:
critic loss [0.038887814, 0.030032998]
critic loss [0.037132107, 0.02931691]
critic loss [0

get new action:  0.2300262451171875
start getting new_q
get new_q:  0.06423020362854004
train start:
critic loss [0.046270773, 0.027627693]
critic loss [0.043473341, 0.027408667]
critic loss [0.039594807, 0.0269817]
critic loss [0.035378158, 0.026445542]
critic loss [0.031478886, 0.025870256]
actor loss nan
train end 0.518531084060669
should be 1.0, predicted: [ 1.06940091], target predicted: [ 0.57031536]
should be -1.0, predicted: [-0.87820083], target predicted: [-0.94221777]
should be -1.0, predicted: [-0.96979809], target predicted: [-0.95197225]
should be 1.0, predicted: [ 1.12032688], target predicted: [ 0.94447356]
start searching new action
get new action:  0.2309279441833496
start getting new_q
get new_q:  0.05996847152709961
train start:
critic loss [0.040136509, 0.033879191]
critic loss [0.039668538, 0.034004726]
critic loss [0.038917903, 0.033931948]
critic loss [0.037936445, 0.033669088]
critic loss [0.036792118, 0.033241894]
actor loss nan
train end 0.5019588470458984
sh

get new action:  0.23058509826660156
start getting new_q
get new_q:  0.06491231918334961
train start:
critic loss [0.038419016, 0.032504439]
critic loss [0.037710793, 0.032017563]
critic loss [0.036783691, 0.031476747]
critic loss [0.035741724, 0.030922091]
critic loss [0.034670793, 0.030369688]
actor loss nan
train end 0.6192729473114014
should be -1.0, predicted: [-0.89569509], target predicted: [-0.75311422]
should be 1.0, predicted: [ 0.72992498], target predicted: [ 0.81485379]
should be -1.0, predicted: [-1.22323596], target predicted: [-1.11048031]
should be 1.0, predicted: [ 0.96416384], target predicted: [ 1.11535513]
should be 1.0, predicted: [ 1.04711676], target predicted: [ 1.03655005]
start searching new action
get new action:  0.23166704177856445
start getting new_q
get new_q:  0.0646049976348877
train start:
critic loss [0.028949317, 0.021831442]
critic loss [0.028335715, 0.021674449]
critic loss [0.02754125, 0.021509711]
critic loss [0.026614081, 0.02134319]
critic los

critic loss [0.027952095, 0.026716813]
critic loss [0.02814427, 0.026855081]
critic loss [0.028123863, 0.026862644]
critic loss [0.027891766, 0.026734903]
critic loss [0.027468953, 0.026474793]
actor loss nan
train end 0.6960122585296631
should be -1.0, predicted: [-1.06759107], target predicted: [-1.04561353]
should be 1.0, predicted: [ 1.21474421], target predicted: [ 1.02513015]
start searching new action
get new action:  0.23520207405090332
start getting new_q
get new_q:  0.06701874732971191
train start:
critic loss [0.042739276, 0.026247587]
critic loss [0.041284822, 0.025589602]
critic loss [0.03852259, 0.024534434]
critic loss [0.035008185, 0.023252897]
critic loss [0.031215701, 0.021910738]
actor loss nan
train end 0.639686107635498
should be -1.0, predicted: [-0.75826991], target predicted: [-0.54815722]
start searching new action
get new action:  0.22717022895812988
start getting new_q
get new_q:  0.06901335716247559
train start:
critic loss [0.033934109, 0.025748108]
critic 

critic loss [0.044879436, 0.038565576]
actor loss nan
train end 0.5262007713317871
should be 1.0, predicted: [ 0.9360559], target predicted: [ 0.9656359]
should be 1.0, predicted: [ 0.8771044], target predicted: [ 0.88362789]
should be 1.0, predicted: [ 1.06192815], target predicted: [ 0.67210197]
start searching new action
get new action:  0.22045516967773438
start getting new_q
get new_q:  0.06330752372741699
train start:
critic loss [0.039295468, 0.023782123]
critic loss [0.036165323, 0.022763284]
critic loss [0.03235583, 0.021692306]
critic loss [0.028412843, 0.020669611]
critic loss [0.024801597, 0.019782081]
actor loss nan
train end 0.5110580921173096
should be 1.0, predicted: [ 0.95137703], target predicted: [ 0.85077721]
should be 1.0, predicted: [ 1.15076399], target predicted: [ 0.75217491]
should be 1.0, predicted: [ 0.68081975], target predicted: [ 0.74733871]
start searching new action
get new action:  0.22925591468811035
start getting new_q
get new_q:  0.07616829872131348

get new action:  0.260068416595459
start getting new_q
get new_q:  0.08700728416442871
train start:
critic loss [0.027431069, 0.024609545]
critic loss [0.027068542, 0.024446351]
critic loss [0.02659988, 0.024269393]
critic loss [0.026091428, 0.024105828]
critic loss [0.02558982, 0.023965327]
actor loss nan
train end 0.779041051864624
should be -1.0, predicted: [-0.73264766], target predicted: [-0.66431373]
should be 1.0, predicted: [ 0.95076716], target predicted: [ 1.00738502]
start searching new action
get new action:  0.23245859146118164
start getting new_q
get new_q:  0.06599164009094238
train start:
critic loss [0.027226966, 0.024447637]
critic loss [0.027350847, 0.024389721]
critic loss [0.027213894, 0.02420906]
critic loss [0.026840925, 0.023922207]
critic loss [0.02626566, 0.02354572]
actor loss nan
train end 0.5707540512084961
should be 1.0, predicted: [ 0.4925155], target predicted: [ 0.94669193]
start searching new action
get new action:  0.22778725624084473
start getting ne

get new action:  0.2370166778564453
start getting new_q
get new_q:  0.06413722038269043
train start:
critic loss [0.019098226, 0.019098226]
critic loss [0.018336739, 0.018336739]
critic loss [0.017602935, 0.017602935]
critic loss [0.016934, 0.016934]
critic loss [0.016349994, 0.016349994]
actor loss nan
train end 0.6110568046569824
start searching new action
get new action:  0.22387480735778809
start getting new_q
get new_q:  0.06054520606994629
train start:
critic loss [0.047277562, 0.030928142]
critic loss [0.046474688, 0.030825909]
critic loss [0.044679798, 0.030521318]
critic loss [0.042170756, 0.030025389]
critic loss [0.039226212, 0.029381623]
actor loss nan
train end 0.6259024143218994
should be 1.0, predicted: [ 1.02725685], target predicted: [ 1.16588223]
should be -1.0, predicted: [-0.42109439], target predicted: [-0.63945353]
should be 1.0, predicted: [ 0.93686384], target predicted: [ 0.84884071]
start searching new action
get new action:  0.2280111312866211
start getting n

get new action:  0.23984789848327637
start getting new_q
get new_q:  0.06984210014343262
train start:
critic loss [0.030822093, 0.030718904]
critic loss [0.03008287, 0.030072361]
critic loss [0.029465754, 0.029456653]
critic loss [0.028936228, 0.028864427]
critic loss [0.028440174, 0.02827049]
actor loss nan
train end 0.5510921478271484
should be 1.0, predicted: [ 0.9974798], target predicted: [ 1.09971201]
start searching new action
get new action:  0.24331426620483398
start getting new_q
get new_q:  0.06914615631103516
train start:
critic loss [0.026931062, 0.026496226]
critic loss [0.026517723, 0.026239021]
critic loss [0.026059512, 0.02590235]
critic loss [0.025567923, 0.025493396]
critic loss [0.025047887, 0.025021257]
actor loss nan
train end 0.6141932010650635
should be -1.0, predicted: [-1.01604044], target predicted: [-1.18239987]
start searching new action
get new action:  0.24738144874572754
start getting new_q
get new_q:  0.07062149047851562
train start:
critic loss [0.0212

critic loss [0.02154131, 0.018567042]
critic loss [0.021185914, 0.018429726]
critic loss [0.020609729, 0.018185122]
critic loss [0.019883277, 0.017860051]
actor loss nan
train end 0.5566949844360352
should be 1.0, predicted: [ 1.0027169], target predicted: [ 0.84558976]
should be 1.0, predicted: [ 0.90278095], target predicted: [ 0.94174463]
start searching new action
get new action:  0.23603391647338867
start getting new_q
get new_q:  0.07519102096557617
train start:
critic loss [0.033683375, 0.024571173]
critic loss [0.032342479, 0.024396783]
critic loss [0.030571936, 0.024124147]
critic loss [0.028651211, 0.023795135]
critic loss [0.026888601, 0.023452403]
actor loss nan
train end 0.7475600242614746
should be 1.0, predicted: [ 1.14118814], target predicted: [ 0.6737445]
should be 1.0, predicted: [ 1.04227746], target predicted: [ 0.49332339]
should be 1.0, predicted: [ 1.20659626], target predicted: [ 1.00413811]
should be 1.0, predicted: [ 1.20616317], target predicted: [ 1.0151361

start searching new action
get new action:  0.26865720748901367
start getting new_q
get new_q:  0.0750737190246582
train start:
critic loss [0.021353744, 0.021353744]
critic loss [0.020941779, 0.020941779]
critic loss [0.020424206, 0.020424206]
critic loss [0.019834179, 0.019834179]
critic loss [0.019191653, 0.019191653]
actor loss nan
train end 0.7835652828216553
start searching new action
get new action:  0.269641637802124
start getting new_q
get new_q:  0.07158732414245605
train start:
critic loss [0.028680135, 0.02447878]
critic loss [0.028547481, 0.024409752]
critic loss [0.027775798, 0.024021693]
critic loss [0.026495323, 0.023353985]
critic loss [0.02487232, 0.022462962]
actor loss nan
train end 0.8736791610717773
should be 1.0, predicted: [ 1.42997706], target predicted: [ 0.98940212]
start searching new action
get new action:  0.2788243293762207
start getting new_q
get new_q:  0.06905722618103027
train start:
critic loss [0.025313316, 0.021835154]
critic loss [0.023865726, 0.0

get new action:  0.25997042655944824
start getting new_q
get new_q:  0.06225705146789551
train start:
critic loss [0.015359422, 0.014138866]
critic loss [0.015177806, 0.014043434]
critic loss [0.014788099, 0.013810109]
critic loss [0.014240826, 0.013462064]
critic loss [0.013603951, 0.013036491]
actor loss nan
train end 0.7190418243408203
should be 1.0, predicted: [ 1.04723895], target predicted: [ 1.01021767]
should be 1.0, predicted: [ 1.10257518], target predicted: [ 1.06616497]
start searching new action
get new action:  0.26273059844970703
start getting new_q
get new_q:  0.06578230857849121
train start:
critic loss [0.081540242, 0.019642543]
critic loss [0.073920935, 0.018808238]
critic loss [0.060525376, 0.017287893]
critic loss [0.045364611, 0.015578381]
critic loss [0.031318925, 0.014057018]
actor loss nan
train end 0.6212987899780273
should be 1.0, predicted: [ 0.99603063], target predicted: [ 0.90911198]
should be 1.0, predicted: [ 1.00263655], target predicted: [ 0.9594413]


get new action:  0.2663130760192871
start getting new_q
get new_q:  0.07055473327636719
train start:
critic loss [0.015791725, 0.012230604]
critic loss [0.015236374, 0.011665911]
critic loss [0.014419761, 0.011105934]
critic loss [0.01341984, 0.010574581]
critic loss [0.012334173, 0.010089165]
actor loss nan
train end 0.870819091796875
should be 1.0, predicted: [ 0.91357613], target predicted: [ 0.97954088]
should be 1.0, predicted: [ 1.10924649], target predicted: [ 1.14910638]
start searching new action
get new action:  0.2675795555114746
start getting new_q
get new_q:  0.08316326141357422
train start:
critic loss [0.070142545, 0.020012531]
critic loss [0.061331511, 0.018263938]
critic loss [0.047067944, 0.015850045]
critic loss [0.033187568, 0.013509491]
critic loss [0.024709027, 0.011909809]
actor loss nan
train end 1.0953338146209717
should be 1.0, predicted: [ 0.66073024], target predicted: [ 1.0067395]
should be 1.0, predicted: [ 0.96565783], target predicted: [ 1.13423657]
star

critic loss [0.012474307, 0.012474307]
actor loss nan
train end 0.6016018390655518
start searching new action
get new action:  0.25984716415405273
start getting new_q
get new_q:  0.0659186840057373
train start:
critic loss [0.013156865, 0.011406012]
critic loss [0.012713695, 0.011115786]
critic loss [0.012135824, 0.010802955]
critic loss [0.011498867, 0.010488544]
critic loss [0.010874879, 0.010190366]
actor loss nan
train end 0.5618789196014404
should be 1.0, predicted: [ 0.87826121], target predicted: [ 0.8014186]
start searching new action
get new action:  0.26227807998657227
start getting new_q
get new_q:  0.06574869155883789
train start:
critic loss [0.015002782, 0.015002782]
critic loss [0.014258949, 0.014258949]
critic loss [0.013455473, 0.013455473]
critic loss [0.012639808, 0.012639808]
critic loss [0.01184976, 0.01184976]
actor loss nan
train end 0.5097754001617432
start searching new action
get new action:  0.2525343894958496
start getting new_q
get new_q:  0.070119619369506

critic loss [0.0097151669, 0.0097151669]
critic loss [0.0094805798, 0.0094805798]
critic loss [0.0092610037, 0.0092610037]
critic loss [0.0090784598, 0.0090784598]
actor loss nan
train end 0.6599352359771729
start searching new action
get new action:  0.2680695056915283
start getting new_q
get new_q:  0.06667065620422363
train start:
critic loss [0.018025439, 0.018014446]
critic loss [0.017825294, 0.017819319]
critic loss [0.017583858, 0.017582361]
critic loss [0.01731177, 0.017311709]
critic loss [0.017014589, 0.017011005]
actor loss nan
train end 0.608738899230957
should be 1.0, predicted: [ 1.00186646], target predicted: [ 0.73140407]
start searching new action
get new action:  0.2429358959197998
start getting new_q
get new_q:  0.0794210433959961
train start:
critic loss [0.013557551, 0.013557551]
critic loss [0.013366395, 0.013366395]
critic loss [0.013091247, 0.013091247]
critic loss [0.012744493, 0.012744493]
critic loss [0.012344408, 0.012344408]
actor loss nan
train end 0.57928

critic loss [0.017406456, 0.009822078]
critic loss [0.013981757, 0.0092942342]
actor loss nan
train end 0.6866133213043213
should be 1.0, predicted: [ 0.82444525], target predicted: [ 0.70629662]
should be 1.0, predicted: [ 0.99012369], target predicted: [ 0.91289157]
should be 1.0, predicted: [ 0.73158205], target predicted: [ 0.66132694]
start searching new action
get new action:  0.25125741958618164
start getting new_q
get new_q:  0.06685185432434082
train start:
critic loss [0.024464676, 0.017282328]
critic loss [0.02486454, 0.017556507]
critic loss [0.024136335, 0.017451491]
critic loss [0.022535795, 0.017022166]
critic loss [0.020425126, 0.016363978]
actor loss nan
train end 0.8833527565002441
should be 1.0, predicted: [ 1.06927156], target predicted: [ 0.91136646]
should be -1.0, predicted: [-0.83005506], target predicted: [-0.43722051]
should be 1.0, predicted: [ 1.08267415], target predicted: [ 0.96515697]
start searching new action
get new action:  0.27347826957702637
start g

get new action:  0.24605298042297363
start getting new_q
get new_q:  0.0683736801147461
train start:
critic loss [0.023024963, 0.017079631]
critic loss [0.021744348, 0.01649341]
critic loss [0.020173099, 0.015866235]
critic loss [0.018545903, 0.015276914]
critic loss [0.017042458, 0.014780029]
actor loss nan
train end 0.5558710098266602
should be -1.0, predicted: [-0.80481642], target predicted: [-0.83126068]
Episode : 913 Replay Buffer 50000
TOTAL REWARD @ 913-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2631509304046631
start getting new_q
get new_q:  0.06879115104675293
train start:
critic loss [0.013554589, 0.008559892]
critic loss [0.013178481, 0.008543158]
critic loss [0.012707083, 0.0085831676]
critic loss [0.012181559, 0.0086611211]
critic loss [0.01163701, 0.0087506874]
actor loss nan
train end 0.5153079032897949
should be 1.0, predicted: [ 0.81715429], target predicted: [ 0.87971085]
should be -1.0, predicted: [-0.90318739], target pred

critic loss [0.0075623421, 0.0074600554]
critic loss [0.0075150821, 0.007374228]
critic loss [0.0074270824, 0.0072601428]
critic loss [0.0072964472, 0.0071201869]
actor loss nan
train end 0.779801607131958
should be 1.0, predicted: [ 0.99103266], target predicted: [ 0.92451096]
Episode : 915 Replay Buffer 50000
TOTAL REWARD @ 915-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2673377990722656
start getting new_q
get new_q:  0.0666358470916748
train start:
critic loss [0.012723086, 0.012723086]
critic loss [0.012616026, 0.012616026]
critic loss [0.012468718, 0.012468718]
critic loss [0.012282691, 0.012282691]
critic loss [0.012062512, 0.012062512]
actor loss nan
train end 0.6924386024475098
start searching new action
get new action:  0.25693225860595703
start getting new_q
get new_q:  0.07203865051269531
train start:
critic loss [0.0070665674, 0.0068441401]
critic loss [0.0070088538, 0.0067771715]
critic loss [0.0069292281, 0.0067009488]
critic loss

get new action:  0.2621021270751953
start getting new_q
get new_q:  0.06696486473083496
train start:
critic loss [0.010119924, 0.010119924]
critic loss [0.0097839907, 0.0097839907]
critic loss [0.0093459096, 0.0093459096]
critic loss [0.0088523794, 0.0088523794]
critic loss [0.0083476277, 0.0083476277]
actor loss nan
train end 0.5149266719818115
Episode : 917 Replay Buffer 50000
TOTAL REWARD @ 917-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.26945066452026367
start getting new_q
get new_q:  0.07001829147338867
train start:
critic loss [0.019401513, 0.010502126]
critic loss [0.018618055, 0.0101292]
critic loss [0.0176524, 0.0097895805]
critic loss [0.016591581, 0.0095003508]
critic loss [0.015503991, 0.0092671774]
actor loss nan
train end 0.6499853134155273
should be 1.0, predicted: [ 0.61596638], target predicted: [ 0.61561096]
should be 1.0, predicted: [ 0.83630675], target predicted: [ 0.88998896]
should be -1.0, predicted: [-1.11276841], targe

critic loss [0.01440187, 0.010819743]
actor loss nan
train end 0.503854513168335
should be -1.0, predicted: [-0.62215167], target predicted: [-0.63196665]
should be 1.0, predicted: [ 0.83789939], target predicted: [ 0.80280066]
should be 1.0, predicted: [ 0.94750613], target predicted: [ 0.89080507]
Episode : 919 Replay Buffer 50000
TOTAL REWARD @ 919-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.24543237686157227
start getting new_q
get new_q:  0.07190299034118652
train start:
critic loss [0.015933637, 0.015004631]
critic loss [0.015740201, 0.015000814]
critic loss [0.015503984, 0.014976675]
critic loss [0.015238494, 0.014909315]
critic loss [0.014949627, 0.014778261]
actor loss nan
train end 0.5137486457824707
should be 1.0, predicted: [ 1.04058921], target predicted: [ 1.06081462]
start searching new action
get new action:  0.25197482109069824
start getting new_q
get new_q:  0.0604097843170166
train start:
critic loss [0.016432729, 0.0094127897

get new action:  0.24283266067504883
start getting new_q
get new_q:  0.06222963333129883
train start:
critic loss [0.019996408, 0.011101383]
critic loss [0.019788725, 0.011468438]
critic loss [0.019377643, 0.01192626]
critic loss [0.018757459, 0.012354202]
critic loss [0.017931979, 0.012652414]
actor loss nan
train end 0.5021305084228516
should be -1.0, predicted: [-1.0714891], target predicted: [-0.97227651]
should be -1.0, predicted: [-0.82817447], target predicted: [-0.34297419]
Episode : 921 Replay Buffer 50000
TOTAL REWARD @ 921-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2624177932739258
start getting new_q
get new_q:  0.06554841995239258
train start:
critic loss [0.020997815, 0.019790307]
critic loss [0.020873111, 0.019702859]
critic loss [0.020169728, 0.01907834]
critic loss [0.018993873, 0.018058194]
critic loss [0.017547419, 0.016829859]
actor loss nan
train end 0.9084243774414062
should be 1.0, predicted: [ 0.79210609], target predict

get new action:  0.25803232192993164
start getting new_q
get new_q:  0.06973838806152344
train start:
critic loss [0.008815147, 0.008815147]
critic loss [0.008785665, 0.008785665]
critic loss [0.0087373815, 0.0087373815]
critic loss [0.0086650178, 0.0086650178]
critic loss [0.0085580833, 0.0085580833]
actor loss nan
train end 0.5960416793823242
start searching new action
get new action:  0.26787328720092773
start getting new_q
get new_q:  0.07790422439575195
train start:
critic loss [0.011393338, 0.011393338]
critic loss [0.011281801, 0.011281801]
critic loss [0.011106149, 0.011106149]
critic loss [0.010883789, 0.010883789]
critic loss [0.010630378, 0.010630378]
actor loss nan
train end 0.6756856441497803
Episode : 923 Replay Buffer 50000
TOTAL REWARD @ 923-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2706899642944336
start getting new_q
get new_q:  0.06995010375976562
train start:
critic loss [0.015754111, 0.013800693]
critic loss [0.015419809, 

critic loss [0.010043785, 0.0099049946]
actor loss nan
train end 0.5305421352386475
should be 1.0, predicted: [ 1.02886283], target predicted: [ 1.0086143]
should be 1.0, predicted: [ 1.01437426], target predicted: [ 1.03091645]
should be 1.0, predicted: [ 1.09722567], target predicted: [ 1.06053936]
start searching new action
get new action:  0.2552917003631592
start getting new_q
get new_q:  0.06574177742004395
train start:
critic loss [0.030510826, 0.016820416]
critic loss [0.028789258, 0.016679971]
critic loss [0.026153505, 0.016413357]
critic loss [0.023259083, 0.016078927]
critic loss [0.020516973, 0.015714701]
actor loss nan
train end 0.534400463104248
should be 1.0, predicted: [ 1.20124328], target predicted: [ 1.10956752]
should be 1.0, predicted: [ 1.20568657], target predicted: [ 1.0749048]
should be 1.0, predicted: [ 0.90617132], target predicted: [ 0.95881701]
should be 1.0, predicted: [ 0.87943256], target predicted: [ 0.91754603]
Episode : 925 Replay Buffer 50000
TOTAL R

get new action:  0.24657297134399414
start getting new_q
get new_q:  0.06007885932922363
train start:
critic loss [0.01633135, 0.014533209]
critic loss [0.015982419, 0.014349114]
critic loss [0.015318491, 0.013937594]
critic loss [0.014421547, 0.013335135]
critic loss [0.013384698, 0.01259549]
actor loss nan
train end 0.5171575546264648
should be 1.0, predicted: [ 1.09731162], target predicted: [ 0.9391402]
should be 1.0, predicted: [ 1.13925016], target predicted: [ 1.13850307]
start searching new action
get new action:  0.25972485542297363
start getting new_q
get new_q:  0.0610203742980957
train start:
critic loss [0.013408079, 0.011720837]
critic loss [0.013025025, 0.011526389]
critic loss [0.012522779, 0.011267082]
critic loss [0.011959981, 0.010962337]
critic loss [0.011395621, 0.010635165]
actor loss nan
train end 0.5489201545715332
should be 1.0, predicted: [ 1.04495192], target predicted: [ 1.03754258]
should be 1.0, predicted: [ 0.99503642], target predicted: [ 1.05257893]
sta

get new action:  0.2792799472808838
start getting new_q
get new_q:  0.0666496753692627
train start:
critic loss [0.01210802, 0.010720777]
critic loss [0.012084877, 0.010829683]
critic loss [0.011947934, 0.010886913]
critic loss [0.011713397, 0.010878166]
critic loss [0.011403415, 0.01079735]
actor loss nan
train end 0.5120055675506592
should be 1.0, predicted: [ 0.88980663], target predicted: [ 0.99363279]
start searching new action
get new action:  0.26474547386169434
start getting new_q
get new_q:  0.06758308410644531
train start:
critic loss [0.017075684, 0.010841509]
critic loss [0.015953109, 0.010493524]
critic loss [0.014299301, 0.010026982]
critic loss [0.012458755, 0.0095173437]
critic loss [0.01075758, 0.009044678]
actor loss nan
train end 0.6956479549407959
should be -1.0, predicted: [-1.27376509], target predicted: [-1.2141217]
should be 1.0, predicted: [ 0.92551333], target predicted: [ 1.0517441]
should be 1.0, predicted: [ 1.06563473], target predicted: [ 0.07149766]
star

critic loss [0.017202301, 0.014112555]
critic loss [0.015886461, 0.013473209]
critic loss [0.014416607, 0.012767762]
critic loss [0.013013266, 0.012071978]
actor loss nan
train end 0.6224465370178223
should be 1.0, predicted: [ 0.8835752], target predicted: [ 0.71751237]
start searching new action
get new action:  0.24998998641967773
start getting new_q
get new_q:  0.07146215438842773
train start:
critic loss [0.012424669, 0.012424669]
critic loss [0.012490995, 0.012490995]
critic loss [0.01248241, 0.01248241]
critic loss [0.012377264, 0.012377264]
critic loss [0.012166536, 0.012166536]
actor loss nan
train end 0.753044605255127
start searching new action
get new action:  0.33943867683410645
start getting new_q
get new_q:  0.07298755645751953
train start:
critic loss [0.018259156, 0.016525172]
critic loss [0.0177543, 0.016025383]
critic loss [0.016565656, 0.015085867]
critic loss [0.014938102, 0.01385966]
critic loss [0.013153937, 0.012510527]
actor loss nan
train end 0.872799873352050

critic loss [0.024018675, 0.010093312]
actor loss nan
train end 0.5217230319976807
should be 1.0, predicted: [ 1.01188421], target predicted: [-0.27386737]
should be 1.0, predicted: [ 0.68514282], target predicted: [ 0.6466279]
start searching new action
get new action:  0.26624059677124023
start getting new_q
get new_q:  0.06387591361999512
train start:
critic loss [0.016958036, 0.007942941]
critic loss [0.014326855, 0.0078022182]
critic loss [0.011977915, 0.0076873815]
critic loss [0.010042163, 0.0075892401]
critic loss [0.0088190529, 0.0075161736]
actor loss nan
train end 0.5456094741821289
should be 1.0, predicted: [ 0.99871981], target predicted: [ 1.03329659]
should be 1.0, predicted: [ 1.02739596], target predicted: [ 0.69050181]
should be 1.0, predicted: [ 1.01229322], target predicted: [ 1.05548155]
start searching new action
get new action:  0.2536897659301758
start getting new_q
get new_q:  0.07378745079040527
train start:
critic loss [0.015752994, 0.014942145]
critic loss [

get new action:  0.23469281196594238
start getting new_q
get new_q:  0.07409286499023438
train start:
critic loss [0.0077342065, 0.0077342065]
critic loss [0.0078152474, 0.0078152474]
critic loss [0.007882691, 0.007882691]
critic loss [0.0079066753, 0.0079066753]
critic loss [0.0078743473, 0.0078743473]
actor loss nan
train end 0.54290771484375
start searching new action
get new action:  0.24271082878112793
start getting new_q
get new_q:  0.0672769546508789
train start:
critic loss [0.076455474, 0.018620292]
critic loss [0.068981744, 0.017776325]
critic loss [0.056133889, 0.016305257]
critic loss [0.041709453, 0.014659294]
critic loss [0.028361337, 0.013165153]
actor loss nan
train end 0.5774333477020264
should be 1.0, predicted: [ 1.03706706], target predicted: [ 1.03901577]
should be 1.0, predicted: [ 0.93669373], target predicted: [ 0.59024626]
should be 1.0, predicted: [ 1.07267904], target predicted: [ 1.02157629]
start searching new action
get new action:  0.26234912872314453
sta

actor loss nan
train end 0.7883672714233398
should be 1.0, predicted: [ 1.3002305], target predicted: [ 1.12217915]
should be 1.0, predicted: [ 1.07024455], target predicted: [ 1.03247893]
start searching new action
get new action:  0.2691493034362793
start getting new_q
get new_q:  0.06695938110351562
train start:
critic loss [0.055894092, 0.014997594]
critic loss [0.048369315, 0.014192137]
critic loss [0.03700231, 0.01288174]
critic loss [0.025539376, 0.01149521]
critic loss [0.016682304, 0.01036787]
actor loss nan
train end 0.5804121494293213
should be -1.0, predicted: [-0.79758054], target predicted: [-0.52940416]
start searching new action
get new action:  0.2513766288757324
start getting new_q
get new_q:  0.07132124900817871
train start:
critic loss [0.013896223, 0.013896223]
critic loss [0.01315149, 0.01315149]
critic loss [0.012478998, 0.012478998]
critic loss [0.011898911, 0.011898911]
critic loss [0.011412092, 0.011412092]
actor loss nan
train end 0.6137120723724365
start sea

critic loss [0.011561711, 0.010463831]
critic loss [0.010954721, 0.010284797]
critic loss [0.010371064, 0.010065113]
critic loss [0.0099041145, 0.009824614]
actor loss nan
train end 0.5794515609741211
should be -1.0, predicted: [-0.89719796], target predicted: [-0.82397676]
start searching new action
get new action:  0.2632768154144287
start getting new_q
get new_q:  0.06496143341064453
train start:
critic loss [0.013307304, 0.012684071]
critic loss [0.013064429, 0.012406839]
critic loss [0.012646228, 0.012001327]
critic loss [0.012098666, 0.011507447]
critic loss [0.011483284, 0.010974483]
actor loss nan
train end 0.5907485485076904
should be -1.0, predicted: [-1.12429023], target predicted: [-1.06383777]
Episode : 939 Replay Buffer 50000
TOTAL REWARD @ 939-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.25537919998168945
start getting new_q
get new_q:  0.0642693042755127
train start:
critic loss [0.015014814, 0.014981763]
critic loss [0.014549516,

critic loss [0.015029398, 0.014999976]
critic loss [0.016318738, 0.016266871]
critic loss [0.017224869, 0.017158942]
critic loss [0.017673414, 0.017605755]
actor loss nan
train end 0.5889990329742432
should be 1.0, predicted: [ 0.90616935], target predicted: [ 1.11614871]
start searching new action
get new action:  0.2559316158294678
start getting new_q
get new_q:  0.06511640548706055
train start:
critic loss [0.061428666, 0.021352055]
critic loss [0.055620909, 0.020616852]
critic loss [0.046221308, 0.01902467]
critic loss [0.036032531, 0.017056441]
critic loss [0.027024416, 0.01514614]
actor loss nan
train end 0.5963327884674072
should be 1.0, predicted: [ 1.01710594], target predicted: [ 1.08450603]
should be -1.0, predicted: [-0.61541897], target predicted: [ 0.10709123]
start searching new action
get new action:  0.24400663375854492
start getting new_q
get new_q:  0.06876707077026367
train start:
critic loss [0.066781215, 0.018561065]
critic loss [0.06442371, 0.018456284]
critic lo

get new action:  0.24852466583251953
start getting new_q
get new_q:  0.06725788116455078
train start:
critic loss [0.011172876, 0.009826472]
critic loss [0.010931345, 0.0096099637]
critic loss [0.010686837, 0.0094361939]
critic loss [0.0104486, 0.0093027912]
critic loss [0.010215076, 0.0091971597]
actor loss nan
train end 0.5196952819824219
should be 1.0, predicted: [ 0.83657378], target predicted: [ 1.0262289]
Episode : 943 Replay Buffer 50000
TOTAL REWARD @ 943-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.26215696334838867
start getting new_q
get new_q:  0.07492327690124512
train start:
critic loss [0.012740632, 0.010760872]
critic loss [0.012267839, 0.010662869]
critic loss [0.011568539, 0.010504248]
critic loss [0.010872642, 0.010301786]
critic loss [0.010364322, 0.010071848]
actor loss nan
train end 0.5126552581787109
should be 1.0, predicted: [ 0.5892812], target predicted: [ 0.72467428]
start searching new action
get new action:  0.2351586

critic loss [0.014589365, 0.011811693]
critic loss [0.013355798, 0.011122482]
critic loss [0.011877514, 0.010277761]
critic loss [0.010378554, 0.0093904883]
actor loss nan
train end 0.5211441516876221
should be -1.0, predicted: [-0.99444753], target predicted: [-1.04305875]
should be 1.0, predicted: [ 1.10814798], target predicted: [ 0.99741429]
should be 1.0, predicted: [ 0.96124446], target predicted: [ 0.93969125]
start searching new action
get new action:  0.24815940856933594
start getting new_q
get new_q:  0.06680417060852051
train start:
critic loss [0.025201133, 0.0095379613]
critic loss [0.023771444, 0.0092543466]
critic loss [0.02169488, 0.0090177208]
critic loss [0.019349664, 0.0088555515]
critic loss [0.016951226, 0.0087696966]
actor loss nan
train end 0.5800471305847168
should be 1.0, predicted: [ 0.68858701], target predicted: [ 0.53495938]
should be 1.0, predicted: [ 0.94365102], target predicted: [ 0.95246702]
should be 1.0, predicted: [ 0.96108329], target predicted: [ 

start searching new action
get new action:  0.24998211860656738
start getting new_q
get new_q:  0.06387138366699219
train start:
critic loss [0.011695227, 0.01169478]
critic loss [0.011427764, 0.01142684]
critic loss [0.011118932, 0.011116781]
critic loss [0.010775881, 0.010771235]
critic loss [0.010405265, 0.01039634]
actor loss nan
train end 0.5748963356018066
should be 1.0, predicted: [ 0.99942613], target predicted: [ 1.0856992]
start searching new action
get new action:  0.2792518138885498
start getting new_q
get new_q:  0.08090829849243164
train start:
critic loss [0.021588588, 0.015731078]
critic loss [0.020465987, 0.015274477]
critic loss [0.018903244, 0.014699228]
critic loss [0.017193455, 0.014075804]
critic loss [0.01559836, 0.013467476]
actor loss nan
train end 0.7842564582824707
should be 1.0, predicted: [ 0.94169533], target predicted: [ 1.05616641]
should be 1.0, predicted: [ 0.9853791], target predicted: [ 1.01685262]
should be 1.0, predicted: [ 1.08435357], target pred

get new action:  0.2590317726135254
start getting new_q
get new_q:  0.07086634635925293
train start:
critic loss [0.017877728, 0.011469594]
critic loss [0.017912231, 0.011780303]
critic loss [0.017365171, 0.011830118]
critic loss [0.01639463, 0.011666616]
critic loss [0.015141631, 0.011342941]
actor loss nan
train end 0.506284236907959
should be -1.0, predicted: [-0.55249023], target predicted: [-0.68792534]
should be 1.0, predicted: [ 0.92904085], target predicted: [ 0.76530343]
start searching new action
get new action:  0.2666623592376709
start getting new_q
get new_q:  0.06430625915527344
train start:
critic loss [0.01562098, 0.015610155]
critic loss [0.015525382, 0.015513413]
critic loss [0.015343431, 0.015329694]
critic loss [0.015102033, 0.015086021]
critic loss [0.014810609, 0.014792099]
actor loss nan
train end 0.536674976348877
should be 1.0, predicted: [ 0.97325778], target predicted: [ 0.94375992]
start searching new action
get new action:  0.24176383018493652
start getting

get new action:  0.25424766540527344
start getting new_q
get new_q:  0.07622432708740234
train start:
critic loss [0.010554822, 0.0098380223]
critic loss [0.011046465, 0.010294201]
critic loss [0.011301612, 0.010577649]
critic loss [0.011315892, 0.010673998]
critic loss [0.011111259, 0.010587431]
actor loss nan
train end 0.6869404315948486
should be 1.0, predicted: [ 1.01559341], target predicted: [ 1.05924249]
start searching new action
get new action:  0.2601003646850586
start getting new_q
get new_q:  0.07434320449829102
train start:
critic loss [0.015619622, 0.015328111]
critic loss [0.015172515, 0.014865538]
critic loss [0.014410564, 0.014111206]
critic loss [0.013432026, 0.013159575]
critic loss [0.012342023, 0.012110183]
actor loss nan
train end 0.5536952018737793
should be 1.0, predicted: [ 0.89469969], target predicted: [ 0.95255077]
start searching new action
get new action:  0.2518126964569092
start getting new_q
get new_q:  0.08362579345703125
train start:
critic loss [0.01

actor loss nan
train end 0.6766414642333984
should be -1.0, predicted: [-1.00470674], target predicted: [-1.07546842]
should be 1.0, predicted: [ 0.80944741], target predicted: [ 0.84155017]
start searching new action
get new action:  0.2449963092803955
start getting new_q
get new_q:  0.0699925422668457
train start:
critic loss [0.0080082687, 0.0080030169]
critic loss [0.0079278974, 0.0079162885]
critic loss [0.0080279149, 0.0079590445]
critic loss [0.0082188565, 0.0080702193]
critic loss [0.0084186681, 0.0081943376]
actor loss nan
train end 0.6647143363952637
should be -1.0, predicted: [-1.03560197], target predicted: [-0.99297184]
start searching new action
get new action:  0.24888157844543457
start getting new_q
get new_q:  0.0774235725402832
train start:
critic loss [0.017762255, 0.01140107]
critic loss [0.01721435, 0.011464559]
critic loss [0.015830111, 0.011249701]
critic loss [0.014007133, 0.010826294]
critic loss [0.012142215, 0.010283765]
actor loss nan
train end 0.55934524536

start searching new action
get new action:  0.2949371337890625
start getting new_q
get new_q:  0.0836174488067627
train start:
critic loss [0.012833221, 0.0098633841]
critic loss [0.012532071, 0.009707775]
critic loss [0.011814, 0.0094133671]
critic loss [0.010842992, 0.0090315305]
critic loss [0.0098066404, 0.0086219115]
actor loss nan
train end 0.7911596298217773
should be 1.0, predicted: [ 1.06277359], target predicted: [ 1.05886912]
should be -1.0, predicted: [-0.86318833], target predicted: [-0.9332056]
start searching new action
get new action:  0.2481391429901123
start getting new_q
get new_q:  0.0658717155456543
train start:
critic loss [0.0076065827, 0.0070390641]
critic loss [0.0077110231, 0.0071540065]
critic loss [0.0077556004, 0.0072294711]
critic loss [0.0077367537, 0.0072544925]
critic loss [0.0076555153, 0.0072228275]
actor loss nan
train end 0.6027982234954834
should be 1.0, predicted: [ 0.97735512], target predicted: [ 0.95426279]
should be 1.0, predicted: [ 0.9306180

critic loss [0.008920379, 0.0081478357]
critic loss [0.0088095833, 0.0082736965]
critic loss [0.0087206513, 0.0083773192]
critic loss [0.0086326255, 0.008431715]
actor loss nan
train end 0.5849909782409668
should be 1.0, predicted: [ 1.07768071], target predicted: [ 0.9349463]
should be 1.0, predicted: [ 1.05447686], target predicted: [ 0.93652314]
start searching new action
get new action:  0.24162602424621582
start getting new_q
get new_q:  0.07179498672485352
train start:
critic loss [0.0075316811, 0.0062609641]
critic loss [0.0075659594, 0.0063050278]
critic loss [0.0073732827, 0.0062426743]
critic loss [0.0070073791, 0.0060945093]
critic loss [0.0065471339, 0.0058928961]
actor loss nan
train end 0.5545516014099121
should be 1.0, predicted: [ 1.13136685], target predicted: [ 1.035339]
should be 1.0, predicted: [ 1.17097485], target predicted: [ 1.06266832]
should be 1.0, predicted: [ 1.13869393], target predicted: [ 1.05106318]
should be -1.0, predicted: [-0.73295134], target predi

get new action:  0.2441699504852295
start getting new_q
get new_q:  0.07216787338256836
train start:
critic loss [0.010253223, 0.0097252615]
critic loss [0.0098790471, 0.0096046943]
critic loss [0.0098219374, 0.0096982922]
critic loss [0.010017293, 0.0099235885]
critic loss [0.010357544, 0.010193806]
actor loss nan
train end 0.9674115180969238
should be 1.0, predicted: [ 1.10788071], target predicted: [ 1.00854611]
should be 1.0, predicted: [ 1.12625706], target predicted: [ 1.02953684]
should be 1.0, predicted: [ 1.10634494], target predicted: [ 1.05860257]
should be 1.0, predicted: [ 1.05676031], target predicted: [ 1.06171441]
should be 1.0, predicted: [ 1.05260813], target predicted: [ 1.01923192]
start searching new action
get new action:  0.27365708351135254
start getting new_q
get new_q:  0.07401275634765625
train start:
critic loss [0.015764702, 0.0096935425]
critic loss [0.015418725, 0.0099609531]
critic loss [0.0145312, 0.01002395]
critic loss [0.013316656, 0.0099073835]
crit

Episode : 960 Replay Buffer 50000
TOTAL REWARD @ 960-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2768077850341797
start getting new_q
get new_q:  0.08307170867919922
train start:
critic loss [0.017303165, 0.017140348]
critic loss [0.017025812, 0.0169205]
critic loss [0.016566411, 0.016503094]
critic loss [0.01597257, 0.015937431]
critic loss [0.015290599, 0.015272608]
actor loss nan
train end 0.6197030544281006
should be 1.0, predicted: [ 0.94909286], target predicted: [ 0.90449619]
start searching new action
get new action:  0.2570230960845947
start getting new_q
get new_q:  0.07272577285766602
train start:
critic loss [0.014482496, 0.014438647]
critic loss [0.014098642, 0.014042656]
critic loss [0.013575193, 0.013513697]
critic loss [0.012938607, 0.012879109]
critic loss [0.012217551, 0.012166668]
actor loss nan
train end 0.5452778339385986
should be 1.0, predicted: [ 1.0491004], target predicted: [ 1.04132247]
start searching new action
get n

critic loss [0.0088221468, 0.008216057]
critic loss [0.0085940491, 0.0080449525]
critic loss [0.0083398297, 0.0078597041]
critic loss [0.008079946, 0.0076719178]
actor loss nan
train end 0.5962088108062744
should be 1.0, predicted: [ 1.13010359], target predicted: [ 1.04282045]
should be 1.0, predicted: [ 0.99938953], target predicted: [ 0.88672262]
should be 1.0, predicted: [ 0.88607925], target predicted: [ 1.09476435]
Episode : 962 Replay Buffer 50000
TOTAL REWARD @ 962-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.27007627487182617
start getting new_q
get new_q:  0.07982254028320312
train start:
critic loss [0.017604321, 0.0085416418]
critic loss [0.015920751, 0.0081911627]
critic loss [0.013550973, 0.0077513903]
critic loss [0.011156313, 0.0073164687]
critic loss [0.0091496166, 0.0069509088]
actor loss nan
train end 0.6991763114929199
should be 1.0, predicted: [ 1.01177859], target predicted: [ 1.01180208]
should be 1.0, predicted: [ 1.060124

get new action:  0.2683720588684082
start getting new_q
get new_q:  0.08951377868652344
train start:
critic loss [0.020163994, 0.0085922284]
critic loss [0.019209893, 0.0083940038]
critic loss [0.017635049, 0.0081490707]
critic loss [0.015677297, 0.007875165]
critic loss [0.013597345, 0.0075937584]
actor loss nan
train end 0.605719804763794
should be -1.0, predicted: [-1.15670764], target predicted: [-1.10292768]
should be -1.0, predicted: [-0.74127793], target predicted: [-0.63037127]
start searching new action
get new action:  0.24479031562805176
start getting new_q
get new_q:  0.06423425674438477
train start:
critic loss [0.012078542, 0.0089949882]
critic loss [0.011926492, 0.0089300964]
critic loss [0.011365434, 0.0087303659]
critic loss [0.010516952, 0.0084288726]
critic loss [0.0095526213, 0.0080745835]
actor loss nan
train end 0.8518538475036621
should be 1.0, predicted: [ 1.00344813], target predicted: [ 0.98472798]
should be 1.0, predicted: [ 1.01711011], target predicted: [ 1

critic loss [0.006438328, 0.0062128063]
actor loss nan
train end 0.5255155563354492
should be 1.0, predicted: [ 0.95524627], target predicted: [ 0.95061505]
should be 1.0, predicted: [ 1.03672612], target predicted: [ 1.08645725]
should be 1.0, predicted: [ 1.00505292], target predicted: [ 1.00814867]
start searching new action
get new action:  0.24423623085021973
start getting new_q
get new_q:  0.08461213111877441
train start:
critic loss [0.0084862411, 0.0084490692]
critic loss [0.0086753331, 0.008605862]
critic loss [0.0088293236, 0.0087337196]
critic loss [0.0088986726, 0.0087898821]
critic loss [0.0088631948, 0.0087565789]
actor loss nan
train end 0.6243302822113037
should be 1.0, predicted: [ 0.89520031], target predicted: [ 0.92429167]
start searching new action
get new action:  0.2321929931640625
start getting new_q
get new_q:  0.06826353073120117
train start:
critic loss [0.0087761078, 0.0076863049]
critic loss [0.0085878987, 0.0075637358]
critic loss [0.0081069609, 0.00724229

critic loss [0.010452207, 0.01044479]
critic loss [0.010101108, 0.010098082]
critic loss [0.0096812779, 0.0096804332]
critic loss [0.0092599113, 0.0092598367]
critic loss [0.0088858195, 0.0088857757]
actor loss nan
train end 0.6818275451660156
should be 1.0, predicted: [ 1.06107116], target predicted: [ 1.07494092]
start searching new action
get new action:  0.2500913143157959
start getting new_q
get new_q:  0.06409239768981934
train start:
critic loss [0.010637375, 0.010502881]
critic loss [0.010793002, 0.010664023]
critic loss [0.010752307, 0.010638979]
critic loss [0.010507759, 0.010415626]
critic loss [0.010085629, 0.010014627]
actor loss nan
train end 0.5540568828582764
should be -1.0, predicted: [-0.98410624], target predicted: [-1.06206751]
should be 1.0, predicted: [ 1.0192703], target predicted: [ 1.01765418]
Episode : 968 Replay Buffer 50000
TOTAL REWARD @ 968-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2641870975494385
start getting n

get new action:  0.26497960090637207
start getting new_q
get new_q:  0.07387971878051758
train start:
critic loss [0.011202889, 0.011011451]
critic loss [0.01077427, 0.010612349]
critic loss [0.01025747, 0.010136586]
critic loss [0.0097059999, 0.0096274577]
critic loss [0.0091613671, 0.0091173602]
actor loss nan
train end 0.7732746601104736
should be 1.0, predicted: [ 0.93843758], target predicted: [ 1.00678301]
should be 1.0, predicted: [ 0.91344911], target predicted: [ 1.06349981]
start searching new action
get new action:  0.23959088325500488
start getting new_q
get new_q:  0.07471466064453125
train start:
critic loss [0.0086872187, 0.0084507409]
critic loss [0.008537434, 0.0083645871]
critic loss [0.0083276648, 0.0082127564]
critic loss [0.0080723139, 0.0080050128]
critic loss [0.0077931532, 0.007760671]
actor loss nan
train end 0.5688598155975342
should be 1.0, predicted: [ 0.93634295], target predicted: [ 0.98160464]
trained action prob map predicted by initial model for a start

critic loss [0.0094483001, 0.0071523814]
critic loss [0.0087293983, 0.007021327]
critic loss [0.0079367692, 0.0068904078]
critic loss [0.0072541209, 0.0067695607]
actor loss nan
train end 0.5623950958251953
should be 1.0, predicted: [ 1.00501013], target predicted: [ 1.00800526]
should be 1.0, predicted: [ 1.07496202], target predicted: [ 0.98272419]
start searching new action
get new action:  0.23167872428894043
start getting new_q
get new_q:  0.06121540069580078
train start:
critic loss [0.007872656, 0.007872656]
critic loss [0.0078682732, 0.0078682732]
critic loss [0.0078094951, 0.0078094951]
critic loss [0.0077000889, 0.0077000889]
critic loss [0.0075468719, 0.0075468719]
actor loss nan
train end 0.5579197406768799
start searching new action
get new action:  0.2323317527770996
start getting new_q
get new_q:  0.06966328620910645
train start:
critic loss [0.0095324554, 0.0081408601]
critic loss [0.0090281498, 0.0077748015]
critic loss [0.0083492743, 0.0073126969]
critic loss [0.00760

get new action:  0.24095726013183594
start getting new_q
get new_q:  0.06193208694458008
train start:
critic loss [0.0062843249, 0.0062843249]
critic loss [0.0063421335, 0.0063421335]
critic loss [0.0063698292, 0.0063698292]
critic loss [0.0063674506, 0.0063674506]
critic loss [0.0063363062, 0.0063363062]
actor loss nan
train end 0.5258512496948242
start searching new action
get new action:  0.23796844482421875
start getting new_q
get new_q:  0.064117431640625
train start:
critic loss [0.010371042, 0.0092204548]
critic loss [0.010048965, 0.0090457313]
critic loss [0.0094983662, 0.0086798072]
critic loss [0.0088325981, 0.0082085878]
critic loss [0.0081571238, 0.0077107185]
actor loss nan
train end 0.5297451019287109
should be 1.0, predicted: [ 1.06949675], target predicted: [ 1.06461978]
should be 1.0, predicted: [ 1.04030645], target predicted: [ 0.79649699]
should be 1.0, predicted: [ 1.10845208], target predicted: [ 1.05092061]
start searching new action
get new action:  0.2409300804

critic loss [0.012161115, 0.011573046]
critic loss [0.011737093, 0.01121714]
critic loss [0.011240518, 0.010808092]
critic loss [0.010703197, 0.01036589]
actor loss nan
train end 0.5210027694702148
should be 1.0, predicted: [ 0.92885411], target predicted: [ 0.82293999]
start searching new action
get new action:  0.25215935707092285
start getting new_q
get new_q:  0.061597585678100586
train start:
critic loss [0.011604288, 0.0083020292]
critic loss [0.011462456, 0.0082102884]
critic loss [0.01102091, 0.0080396254]
critic loss [0.010354741, 0.0078066834]
critic loss [0.0095595643, 0.0075361961]
actor loss nan
train end 0.48175764083862305
should be -1.0, predicted: [-0.85364991], target predicted: [-0.84853667]
should be 1.0, predicted: [ 0.95020807], target predicted: [ 0.96658194]
Episode : 976 Replay Buffer 50000
TOTAL REWARD @ 976-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2536945343017578
start getting new_q
get new_q:  0.08555436134338379


get new action:  0.2428278923034668
start getting new_q
get new_q:  0.06808209419250488
train start:
critic loss [0.0076633599, 0.0076633599]
critic loss [0.0076145623, 0.0076145623]
critic loss [0.0073356316, 0.0073356316]
critic loss [0.0068964576, 0.0068964576]
critic loss [0.0063802013, 0.0063802013]
actor loss nan
train end 0.5364530086517334
Episode : 978 Replay Buffer 50000
TOTAL REWARD @ 978-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.3459203243255615
start getting new_q
get new_q:  0.10555672645568848
train start:
critic loss [0.11051122, 0.017192302]
critic loss [0.10223357, 0.016162453]
critic loss [0.088522807, 0.014662648]
critic loss [0.071577489, 0.013003465]
critic loss [0.052906659, 0.011351978]
actor loss nan
train end 0.8242416381835938
should be 1.0, predicted: [ 0.6809482], target predicted: [ 0.96337742]
should be -1.0, predicted: [-0.23439233], target predicted: [ 0.69609779]
start searching new action
get new action:  0.2

critic loss [0.021760127, 0.014803268]
actor loss nan
train end 0.5297286510467529
should be 1.0, predicted: [ 1.14071786], target predicted: [ 1.14117062]
should be 1.0, predicted: [ 0.94099677], target predicted: [ 0.85433066]
should be 1.0, predicted: [ 0.774266], target predicted: [ 0.85495424]
trained action prob map predicted by initial model for a starting game
[[[ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]]

 [[ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]]]
Episode : 980 Replay Buffer 50000
TOTAL REWARD @ 980-th Episode  : Reward 1
Tot

get new action:  0.24464654922485352
start getting new_q
get new_q:  0.08893966674804688
train start:
critic loss [0.011922426, 0.010155431]
critic loss [0.011609083, 0.010015696]
critic loss [0.010986406, 0.0097565353]
critic loss [0.01022182, 0.0094279461]
critic loss [0.0094796699, 0.0090782419]
actor loss nan
train end 0.7858443260192871
should be 1.0, predicted: [ 1.045578], target predicted: [ 1.05204713]
should be 1.0, predicted: [ 1.07626247], target predicted: [ 1.0901078]
Episode : 982 Replay Buffer 50000
TOTAL REWARD @ 982-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2721710205078125
start getting new_q
get new_q:  0.06907486915588379
train start:
critic loss [0.010142727, 0.010142727]
critic loss [0.010053081, 0.010053081]
critic loss [0.0098736454, 0.0098736454]
critic loss [0.0096068941, 0.0096068941]
critic loss [0.0092604924, 0.0092604924]
actor loss nan
train end 0.71108078956604
start searching new action
get new action:  0.2370

critic loss [0.023061549, 0.012523265]
critic loss [0.020160545, 0.011716224]
critic loss [0.016506214, 0.010561149]
critic loss [0.012969328, 0.0093177902]
actor loss nan
train end 0.5342705249786377
should be -1.0, predicted: [-1.03194857], target predicted: [-1.06335926]
should be -1.0, predicted: [-0.90405774], target predicted: [-0.91989619]
should be 1.0, predicted: [ 1.17529857], target predicted: [ 1.1004113]
should be 1.0, predicted: [ 0.96894491], target predicted: [ 0.86366421]
should be 1.0, predicted: [ 1.01056576], target predicted: [ 0.97457594]
Episode : 984 Replay Buffer 50000
TOTAL REWARD @ 984-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.25922656059265137
start getting new_q
get new_q:  0.06603574752807617
train start:
critic loss [0.0068881242, 0.0067956196]
critic loss [0.0074772993, 0.0073631881]
critic loss [0.0080198236, 0.007899968]
critic loss [0.0084452201, 0.0083353762]
critic loss [0.0087136244, 0.0086254068]
actor lo

get new action:  0.24273085594177246
start getting new_q
get new_q:  0.07248806953430176
train start:
critic loss [0.024767548, 0.013466181]
critic loss [0.022611264, 0.012908589]
critic loss [0.019285508, 0.012142563]
critic loss [0.015739359, 0.011366899]
critic loss [0.01278487, 0.01074169]
actor loss nan
train end 0.49856019020080566
should be 1.0, predicted: [ 1.06557584], target predicted: [ 0.83995456]
should be 1.0, predicted: [ 0.58024627], target predicted: [ 0.42841664]
should be 1.0, predicted: [ 1.03108585], target predicted: [ 0.97621483]
start searching new action
get new action:  0.2436389923095703
start getting new_q
get new_q:  0.06964230537414551
train start:
critic loss [0.097746618, 0.017489668]
critic loss [0.075051174, 0.014950339]
critic loss [0.047068126, 0.011922779]
critic loss [0.02478325, 0.0096033281]
critic loss [0.0137018, 0.0085679442]
actor loss nan
train end 0.6230754852294922
should be -1.0, predicted: [-0.70192349], target predicted: [ 0.59895223]
s

critic loss [0.0077653956, 0.0077653956]
actor loss nan
train end 0.9015719890594482
start searching new action
get new action:  0.2660489082336426
start getting new_q
get new_q:  0.06837987899780273
train start:
critic loss [0.011238923, 0.010917582]
critic loss [0.011264456, 0.010968415]
critic loss [0.011104216, 0.010834714]
critic loss [0.010781055, 0.010537983]
critic loss [0.010340674, 0.010122473]
actor loss nan
train end 0.6676843166351318
should be 1.0, predicted: [ 0.84210432], target predicted: [ 0.8594051]
Episode : 988 Replay Buffer 50000
TOTAL REWARD @ 988-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2587621212005615
start getting new_q
get new_q:  0.07708239555358887
train start:
critic loss [0.0087376405, 0.0086731017]
critic loss [0.0084815081, 0.0084249759]
critic loss [0.0082381247, 0.0081967432]
critic loss [0.0080227368, 0.0079985326]
critic loss [0.0078407899, 0.0078305341]
actor loss nan
train end 0.7345507144927979
should 

get new action:  0.25751423835754395
start getting new_q
get new_q:  0.07304501533508301
train start:
critic loss [0.012320422, 0.012242851]
critic loss [0.01229161, 0.012176785]
critic loss [0.012292404, 0.012148226]
critic loss [0.012276899, 0.012116669]
critic loss [0.012209624, 0.012049165]
actor loss nan
train end 0.6530623435974121
should be -1.0, predicted: [-0.84194183], target predicted: [-1.05881417]
start searching new action
get new action:  0.24335741996765137
start getting new_q
get new_q:  0.0628509521484375
train start:
critic loss [0.0094683934, 0.0088930167]
critic loss [0.009535904, 0.0089099733]
critic loss [0.0093809813, 0.0088088484]
critic loss [0.0090491734, 0.0086076269]
critic loss [0.0086141406, 0.0083325934]
actor loss nan
train end 0.5663862228393555
should be 1.0, predicted: [ 0.95464081], target predicted: [ 1.08615613]
should be 1.0, predicted: [ 1.01576126], target predicted: [ 1.14436376]
should be 1.0, predicted: [ 0.95794624], target predicted: [ 1.1

critic loss [0.0090933889, 0.0090675456]
actor loss nan
train end 0.5862321853637695
should be -1.0, predicted: [-0.99204266], target predicted: [-1.09029424]
start searching new action
get new action:  0.25982666015625
start getting new_q
get new_q:  0.06606197357177734
train start:
critic loss [0.011076638, 0.009776691]
critic loss [0.011044168, 0.009879075]
critic loss [0.010848014, 0.0098804729]
critic loss [0.010527492, 0.0097853765]
critic loss [0.010123435, 0.0096046636]
actor loss nan
train end 0.7142047882080078
should be -1.0, predicted: [-0.84950632], target predicted: [-0.79569072]
start searching new action
get new action:  0.26499104499816895
start getting new_q
get new_q:  0.06565070152282715
train start:
critic loss [0.0084191058, 0.0084191058]
critic loss [0.0082653798, 0.0082653798]
critic loss [0.0079946164, 0.0079946164]
critic loss [0.0076431325, 0.0076431325]
critic loss [0.0072466061, 0.0072466061]
actor loss nan
train end 0.7918729782104492
start searching new a

critic loss [0.0063322335, 0.0062415753]
actor loss nan
train end 0.5799624919891357
should be 1.0, predicted: [ 0.98300773], target predicted: [ 0.78212112]
start searching new action
get new action:  0.23546099662780762
start getting new_q
get new_q:  0.06309914588928223
train start:
critic loss [0.00853475, 0.0083104344]
critic loss [0.0082725352, 0.0081116501]
critic loss [0.0080069229, 0.007911426]
critic loss [0.0077746259, 0.0077276989]
critic loss [0.0075881029, 0.0075633815]
actor loss nan
train end 0.6307041645050049
should be 1.0, predicted: [ 1.05721712], target predicted: [ 1.01774633]
should be 1.0, predicted: [ 1.00376892], target predicted: [ 1.01649404]
start searching new action
get new action:  0.25051259994506836
start getting new_q
get new_q:  0.07083344459533691
train start:
critic loss [0.015613913, 0.0088993544]
critic loss [0.015224101, 0.0090423189]
critic loss [0.014342776, 0.0089997686]
critic loss [0.013108738, 0.0087868758]
critic loss [0.011681655, 0.0084

get new action:  0.24247074127197266
start getting new_q
get new_q:  0.06190943717956543
train start:
critic loss [0.01460425, 0.0078838132]
critic loss [0.014280398, 0.0078466525]
critic loss [0.013595248, 0.0077409614]
critic loss [0.012644985, 0.0075778374]
critic loss [0.011538587, 0.007373021]
actor loss nan
train end 0.5179383754730225
should be 1.0, predicted: [ 0.76498663], target predicted: [ 0.57651001]
should be 1.0, predicted: [ 1.18283558], target predicted: [ 0.9886108]
start searching new action
get new action:  0.2337782382965088
start getting new_q
get new_q:  0.06079220771789551
train start:
critic loss [0.011990817, 0.0088207861]
critic loss [0.011629201, 0.0087646488]
critic loss [0.010959828, 0.0085750762]
critic loss [0.010109658, 0.0082942545]
critic loss [0.0092157144, 0.0079711769]
actor loss nan
train end 0.5214765071868896
should be 1.0, predicted: [ 1.19803405], target predicted: [ 1.13935757]
should be 1.0, predicted: [ 1.1267364], target predicted: [ 0.933

critic loss [0.02090279, 0.011972511]
critic loss [0.018881584, 0.011294327]
critic loss [0.016355446, 0.010457512]
critic loss [0.01376002, 0.0096134348]
actor loss nan
train end 0.637845516204834
should be -1.0, predicted: [-0.7776159], target predicted: [-0.37197757]
start searching new action
get new action:  0.2334883213043213
start getting new_q
get new_q:  0.06337523460388184
train start:
critic loss [0.025117077, 0.0081966799]
critic loss [0.023759704, 0.0081383381]
critic loss [0.021716937, 0.0080992263]
critic loss [0.019240715, 0.0080501121]
critic loss [0.016608015, 0.0079734279]
actor loss nan
train end 0.5824992656707764
should be -1.0, predicted: [-0.7319566], target predicted: [-0.30018875]
should be -1.0, predicted: [-1.31575751], target predicted: [-1.25423372]
should be 1.0, predicted: [ 0.86442709], target predicted: [ 0.97248995]
Episode : 998 Replay Buffer 50000
TOTAL REWARD @ 998-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0

critic loss [0.0074432702, 0.0074432702]
critic loss [0.0072122505, 0.0072122505]
critic loss [0.006989032, 0.006989032]
critic loss [0.0067805918, 0.0067805918]
actor loss nan
train end 0.515979528427124
start searching new action
get new action:  0.2525012493133545
start getting new_q
get new_q:  0.0864412784576416
train start:
critic loss [0.022750197, 0.0094720479]
critic loss [0.0215776, 0.0094665065]
critic loss [0.019602273, 0.0093507096]
critic loss [0.01720836, 0.0091576818]
critic loss [0.014761929, 0.0089227976]
actor loss nan
train end 0.5411343574523926
should be -1.0, predicted: [-0.7930336], target predicted: [-0.39395994]
should be -1.0, predicted: [-1.25557792], target predicted: [-1.21448898]
trained action prob map predicted by initial model for a starting game
[[[ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  

get new action:  0.23539471626281738
start getting new_q
get new_q:  0.09430146217346191
train start:
critic loss [0.012957882, 0.010676894]
critic loss [0.013963188, 0.011612764]
critic loss [0.014433697, 0.01224895]
critic loss [0.014403671, 0.01256622]
critic loss [0.013955235, 0.012573764]
actor loss nan
train end 0.5841121673583984
should be -1.0, predicted: [-0.97769594], target predicted: [-0.93600905]
start searching new action
get new action:  0.23701786994934082
start getting new_q
get new_q:  0.06337571144104004
train start:
critic loss [0.0096230563, 0.0094559323]
critic loss [0.0097117359, 0.0095644994]
critic loss [0.0094763683, 0.0093562277]
critic loss [0.0089871362, 0.008896362]
critic loss [0.0083442051, 0.0082805203]
actor loss nan
train end 0.5234677791595459
should be 1.0, predicted: [ 1.0949533], target predicted: [ 0.9836337]
should be 1.0, predicted: [ 1.0279417], target predicted: [ 0.97025907]
start searching new action
get new action:  0.2530395984649658
star

get new action:  0.2402491569519043
start getting new_q
get new_q:  0.06328248977661133
train start:
critic loss [0.014186947, 0.013554527]
critic loss [0.013347012, 0.012779381]
critic loss [0.012426051, 0.011968814]
critic loss [0.011528645, 0.011203486]
critic loss [0.01073549, 0.0105388]
actor loss nan
train end 0.5799810886383057
should be 1.0, predicted: [ 0.99352908], target predicted: [ 0.99712324]
should be 1.0, predicted: [ 1.01001143], target predicted: [ 1.00403666]
start searching new action
get new action:  0.24007844924926758
start getting new_q
get new_q:  0.06813621520996094
train start:
critic loss [0.0080606211, 0.0074971388]
critic loss [0.0075959628, 0.0070124236]
critic loss [0.0071896487, 0.0066237836]
critic loss [0.0068154177, 0.006311838]
critic loss [0.0064630574, 0.0060551837]
actor loss nan
train end 0.5478568077087402
should be 1.0, predicted: [ 0.97811216], target predicted: [ 0.94997036]
should be 1.0, predicted: [ 1.02034009], target predicted: [ 1.0290

critic loss [0.0081456071, 0.0081456071]
critic loss [0.0078248046, 0.0078248046]
actor loss nan
train end 0.8086552619934082
start searching new action
get new action:  0.23543763160705566
start getting new_q
get new_q:  0.07563948631286621
train start:
critic loss [0.017870856, 0.012244355]
critic loss [0.016211212, 0.011607404]
critic loss [0.013903179, 0.010708992]
critic loss [0.011630265, 0.0097423531]
critic loss [0.0098726582, 0.008859775]
actor loss nan
train end 0.7055115699768066
should be -1.0, predicted: [-1.04473186], target predicted: [-0.94565523]
should be 1.0, predicted: [ 1.0420388], target predicted: [ 1.04782712]
should be 1.0, predicted: [ 0.89445168], target predicted: [ 0.9638142]
should be 1.0, predicted: [ 1.00211716], target predicted: [ 0.99911773]
should be 1.0, predicted: [ 1.01114142], target predicted: [ 0.54080606]
should be 1.0, predicted: [ 1.08186114], target predicted: [ 1.11470389]
start searching new action
get new action:  0.2327742576599121
star

get new action:  0.2330925464630127
start getting new_q
get new_q:  0.06757950782775879
train start:
critic loss [0.0092919543, 0.0060408683]
critic loss [0.0086445976, 0.0059359893]
critic loss [0.0076707439, 0.0057907342]
critic loss [0.0068209739, 0.0056505622]
critic loss [0.0062191677, 0.0055272728]
actor loss nan
train end 0.508242130279541
should be -1.0, predicted: [-0.98509008], target predicted: [-0.94675756]
should be 1.0, predicted: [ 0.94091153], target predicted: [ 0.57897514]
should be 1.0, predicted: [ 1.02296853], target predicted: [ 0.87038392]
should be 1.0, predicted: [ 1.07162476], target predicted: [ 1.02943277]
start searching new action
get new action:  0.23421263694763184
start getting new_q
get new_q:  0.06587862968444824
train start:
critic loss [0.0074151312, 0.0074151312]
critic loss [0.0074051102, 0.0074051102]
critic loss [0.007320134, 0.007320134]
critic loss [0.0071725808, 0.0071725808]
critic loss [0.0069740713, 0.0069740713]
actor loss nan
train end 0

critic loss [0.0063208123, 0.0063181906]
actor loss nan
train end 0.5325403213500977
should be 1.0, predicted: [ 0.84644383], target predicted: [ 0.98853242]
start searching new action
get new action:  0.22902798652648926
start getting new_q
get new_q:  0.06818199157714844
train start:
critic loss [0.011569703, 0.010158542]
critic loss [0.010988071, 0.0097222496]
critic loss [0.010073975, 0.0090745101]
critic loss [0.0089814104, 0.0082986467]
critic loss [0.0078681912, 0.0074827392]
actor loss nan
train end 0.5185019969940186
should be 1.0, predicted: [ 0.93464017], target predicted: [ 0.89437789]
start searching new action
get new action:  0.2332296371459961
start getting new_q
get new_q:  0.07165646553039551
train start:
critic loss [0.038631454, 0.016795874]
critic loss [0.03678662, 0.016382916]
critic loss [0.032412384, 0.015463967]
critic loss [0.026632717, 0.014224242]
critic loss [0.020646974, 0.012838039]
actor loss nan
train end 0.534102201461792
should be -1.0, predicted: [-0

get new action:  0.23465657234191895
start getting new_q
get new_q:  0.06449127197265625
train start:
critic loss [0.0095679257, 0.0095679257]
critic loss [0.0090474579, 0.0090474579]
critic loss [0.0087351482, 0.0087351482]
critic loss [0.0085904058, 0.0085904058]
critic loss [0.0085627018, 0.0085627018]
actor loss nan
train end 0.560760498046875
start searching new action
get new action:  0.2357344627380371
start getting new_q
get new_q:  0.06314396858215332
train start:
critic loss [0.018299151, 0.016322616]
critic loss [0.018387781, 0.016473405]
critic loss [0.017643947, 0.015940523]
critic loss [0.016286671, 0.014892468]
critic loss [0.014562227, 0.013520999]
actor loss nan
train end 0.5432970523834229
should be 1.0, predicted: [ 0.89968008], target predicted: [ 0.98819005]
start searching new action
get new action:  0.2326982021331787
start getting new_q
get new_q:  0.07009077072143555
train start:
critic loss [0.010905651, 0.010794593]
critic loss [0.010688663, 0.010620848]
crit

critic loss [0.012260742, 0.0096180905]
critic loss [0.011339648, 0.0096459752]
actor loss nan
train end 0.717397928237915
should be -1.0, predicted: [-0.7281878], target predicted: [-0.45426643]
start searching new action
get new action:  0.2378864288330078
start getting new_q
get new_q:  0.06295990943908691
train start:
critic loss [0.010686491, 0.010668746]
critic loss [0.010399123, 0.010380112]
critic loss [0.0098408768, 0.0098230243]
critic loss [0.0091168778, 0.0091019953]
critic loss [0.0083313286, 0.0083203763]
actor loss nan
train end 0.5881187915802002
should be 1.0, predicted: [ 0.98106104], target predicted: [ 1.0568707]
start searching new action
get new action:  0.2249155044555664
start getting new_q
get new_q:  0.06809616088867188
train start:
critic loss [0.005711318, 0.0055604097]
critic loss [0.0060981875, 0.0059680687]
critic loss [0.0062769195, 0.0061663599]
critic loss [0.0062203836, 0.0061273305]
critic loss [0.0059529478, 0.0058751116]
actor loss nan
train end 0.

critic loss [0.011895448, 0.0056430344]
critic loss [0.010285055, 0.005369206]
critic loss [0.0083043836, 0.0049488302]
critic loss [0.0064396011, 0.0044836244]
actor loss nan
train end 0.5931046009063721
should be 1.0, predicted: [ 1.06526303], target predicted: [ 1.03309345]
should be 1.0, predicted: [ 1.00652742], target predicted: [ 1.02722323]
should be 1.0, predicted: [ 1.06363058], target predicted: [ 1.06262779]
should be 1.0, predicted: [ 0.88542992], target predicted: [ 0.52313101]
should be 1.0, predicted: [ 1.01527166], target predicted: [ 1.04323602]
start searching new action
get new action:  0.25884294509887695
start getting new_q
get new_q:  0.07200908660888672
train start:
critic loss [0.0074247168, 0.0074247168]
critic loss [0.0071257814, 0.0071257814]
critic loss [0.0068568583, 0.0068568583]
critic loss [0.0066341227, 0.0066341227]
critic loss [0.0064597838, 0.0064597838]
actor loss nan
train end 0.643622875213623
start searching new action
get new action:  0.2479469

critic loss [0.014000941, 0.010178577]
critic loss [0.013379939, 0.010191085]
critic loss [0.012550278, 0.010032171]
critic loss [0.011567486, 0.0097128078]
actor loss nan
train end 0.6449360847473145
should be 1.0, predicted: [ 1.17476845], target predicted: [ 1.06396019]
should be -1.0, predicted: [-0.76088262], target predicted: [-0.40443546]
should be -1.0, predicted: [-1.15478957], target predicted: [-0.91261327]
start searching new action
get new action:  0.26398205757141113
start getting new_q
get new_q:  0.07120919227600098
train start:
critic loss [0.025287971, 0.010791183]
critic loss [0.020829445, 0.0098750163]
critic loss [0.014654835, 0.0085451081]
critic loss [0.0094770603, 0.0073261289]
critic loss [0.0070024258, 0.0065653538]
actor loss nan
train end 0.5965640544891357
should be 1.0, predicted: [ 0.98349369], target predicted: [ 0.98855019]
should be 1.0, predicted: [ 1.01868892], target predicted: [ 0.98926347]
should be 1.0, predicted: [ 1.13014984], target predicted:

get new action:  0.2381906509399414
start getting new_q
get new_q:  0.06418108940124512
train start:
critic loss [0.023123713, 0.011015074]
critic loss [0.021754999, 0.010780194]
critic loss [0.019419666, 0.010292035]
critic loss [0.016606964, 0.0096581168]
critic loss [0.013654828, 0.008967177]
actor loss nan
train end 0.5235097408294678
should be 1.0, predicted: [ 0.85716462], target predicted: [ 0.58764541]
should be 1.0, predicted: [ 0.89349234], target predicted: [ 1.02751005]
start searching new action
get new action:  0.23850274085998535
start getting new_q
get new_q:  0.07113385200500488
train start:
critic loss [0.007474971, 0.0074700294]
critic loss [0.0072849928, 0.0072847954]
critic loss [0.0071437852, 0.0071426723]
critic loss [0.0070318836, 0.0070269783]
critic loss [0.0069268895, 0.0069177849]
actor loss nan
train end 0.5284969806671143
should be 1.0, predicted: [ 0.9582628], target predicted: [ 1.06783533]
start searching new action
get new action:  0.22697782516479492


get new action:  0.24538588523864746
start getting new_q
get new_q:  0.07117700576782227
train start:
critic loss [0.0060165864, 0.0056981151]
critic loss [0.0062141754, 0.0059243813]
critic loss [0.0063171294, 0.0060635209]
critic loss [0.0062999241, 0.0060868626]
critic loss [0.0061607212, 0.0059892973]
actor loss nan
train end 0.5927278995513916
should be 1.0, predicted: [ 0.87444454], target predicted: [ 0.93732202]
should be 1.0, predicted: [ 0.98619825], target predicted: [ 1.06257451]
start searching new action
get new action:  0.23195791244506836
start getting new_q
get new_q:  0.06850194931030273
train start:
critic loss [0.0084464569, 0.0084464569]
critic loss [0.0082432348, 0.0082432348]
critic loss [0.0079283798, 0.0079283798]
critic loss [0.0075378725, 0.0075378725]
critic loss [0.007105886, 0.007105886]
actor loss nan
train end 0.5991270542144775
start searching new action
get new action:  0.25723743438720703
start getting new_q
get new_q:  0.07596230506896973
train start

get new action:  0.26151037216186523
start getting new_q
get new_q:  0.08131742477416992
train start:
critic loss [0.0080031846, 0.0077447677]
critic loss [0.0078313621, 0.0075436486]
critic loss [0.007615143, 0.0073062107]
critic loss [0.0073596328, 0.0070398934]
critic loss [0.0070725451, 0.0067535415]
actor loss nan
train end 0.824368953704834
should be 1.0, predicted: [ 1.11300838], target predicted: [ 1.00935805]
should be -1.0, predicted: [-1.14824927], target predicted: [-1.27270961]
start searching new action
get new action:  0.29691600799560547
start getting new_q
get new_q:  0.06596946716308594
train start:
critic loss [0.01339066, 0.013227982]
critic loss [0.013131307, 0.012972954]
critic loss [0.012651499, 0.012509668]
critic loss [0.012013553, 0.011896163]
critic loss [0.01129001, 0.011200813]
actor loss nan
train end 0.6716430187225342
should be 1.0, predicted: [ 1.13861108], target predicted: [ 1.06007814]
start searching new action
get new action:  0.23719573020935059
s

get new action:  0.25530171394348145
start getting new_q
get new_q:  0.06950163841247559
train start:
critic loss [0.0051450133, 0.0040667141]
critic loss [0.0049455352, 0.0039117485]
critic loss [0.0046712914, 0.0037680271]
critic loss [0.0043575, 0.0036374489]
critic loss [0.004040801, 0.003519699]
actor loss nan
train end 0.5708432197570801
should be 1.0, predicted: [ 1.01117361], target predicted: [ 1.01009583]
should be 1.0, predicted: [ 1.0168587], target predicted: [ 1.04864967]
should be 1.0, predicted: [ 1.09823954], target predicted: [ 1.04549289]
start searching new action
get new action:  0.23811793327331543
start getting new_q
get new_q:  0.062271833419799805
train start:
critic loss [0.021505378, 0.0089120548]
critic loss [0.019818487, 0.0088222325]
critic loss [0.01662066, 0.008437302]
critic loss [0.013125004, 0.0079450943]
critic loss [0.010367647, 0.0075011817]
actor loss nan
train end 0.5610783100128174
should be -1.0, predicted: [-0.88902164], target predicted: [-1.

critic loss [0.0087704938, 0.0085114148]
actor loss nan
train end 0.5764420032501221
should be 1.0, predicted: [ 1.02082312], target predicted: [ 1.04185307]
start searching new action
get new action:  0.24519920349121094
start getting new_q
get new_q:  0.08684730529785156
train start:
critic loss [0.011188578, 0.01111569]
critic loss [0.010953506, 0.010889877]
critic loss [0.010480819, 0.010435594]
critic loss [0.0098649869, 0.0098400936]
critic loss [0.0091802403, 0.0091716526]
actor loss nan
train end 0.5172770023345947
should be 1.0, predicted: [ 0.96254981], target predicted: [ 0.99730152]
should be 1.0, predicted: [ 0.96254981], target predicted: [ 0.99730152]
start searching new action
get new action:  0.23961114883422852
start getting new_q
get new_q:  0.06344985961914062
train start:
critic loss [0.01714851, 0.016369998]
critic loss [0.016515393, 0.015801396]
critic loss [0.01574835, 0.015152808]
critic loss [0.01491786, 0.014470332]
critic loss [0.01407771, 0.013779994]
actor

start searching new action
get new action:  0.24427199363708496
start getting new_q
get new_q:  0.06522345542907715
train start:
critic loss [0.0061257612, 0.0051092179]
critic loss [0.006203277, 0.0051991725]
critic loss [0.006115986, 0.0051951613]
critic loss [0.0058653699, 0.0050815381]
critic loss [0.0054841712, 0.004868207]
actor loss nan
train end 0.665062427520752
should be 1.0, predicted: [ 0.91244143], target predicted: [ 1.00404942]
start searching new action
get new action:  0.2512328624725342
start getting new_q
get new_q:  0.06830430030822754
train start:
critic loss [0.023617767, 0.0101497]
critic loss [0.02210471, 0.0099840928]
critic loss [0.019846147, 0.0097403303]
critic loss [0.017248236, 0.0094674621]
critic loss [0.014698454, 0.0092051513]
actor loss nan
train end 0.5827715396881104
should be 1.0, predicted: [ 0.57011509], target predicted: [ 0.58767951]
should be -1.0, predicted: [-1.16993725], target predicted: [-0.6821332]
start searching new action
get new acti

critic loss [0.005145777, 0.0044261431]
critic loss [0.0050263354, 0.0043496513]
critic loss [0.0048260153, 0.0042250641]
critic loss [0.0045710392, 0.0040664915]
actor loss nan
train end 0.6524064540863037
should be 1.0, predicted: [ 1.09824681], target predicted: [ 1.04509175]
start searching new action
get new action:  0.2325742244720459
start getting new_q
get new_q:  0.06217789649963379
train start:
critic loss [0.0045025386, 0.0042151958]
critic loss [0.0044316514, 0.0041377214]
critic loss [0.0043283501, 0.0040371474]
critic loss [0.0041968757, 0.00391781]
critic loss [0.0040450925, 0.0037869532]
actor loss nan
train end 0.5449306964874268
should be 1.0, predicted: [ 0.91378427], target predicted: [ 0.89443231]
should be 1.0, predicted: [ 1.00532627], target predicted: [ 1.00434923]
start searching new action
get new action:  0.2182755470275879
start getting new_q
get new_q:  0.06970572471618652
train start:
critic loss [0.05651037, 0.0088828877]
critic loss [0.050849091, 0.0083

get new action:  0.2259361743927002
start getting new_q
get new_q:  0.0659639835357666
train start:
critic loss [0.0069810762, 0.0069810762]
critic loss [0.0066428799, 0.0066428799]
critic loss [0.0062365467, 0.0062365467]
critic loss [0.0057798857, 0.0057798857]
critic loss [0.0052911025, 0.0052911025]
actor loss nan
train end 0.5106260776519775
start searching new action
get new action:  0.2274768352508545
start getting new_q
get new_q:  0.07197022438049316
train start:
critic loss [0.00709596, 0.0055390853]
critic loss [0.0060639251, 0.005352255]
critic loss [0.005200326, 0.0050444338]
critic loss [0.0047732927, 0.0047131404]
critic loss [0.0047484683, 0.0044128504]
actor loss nan
train end 0.6392827033996582
should be -1.0, predicted: [-1.13155377], target predicted: [-1.15724301]
should be 1.0, predicted: [ 1.0806663], target predicted: [ 0.99678737]
should be 1.0, predicted: [ 1.02306592], target predicted: [ 1.02219331]
start searching new action
get new action:  0.2353913784027

get new action:  0.22324299812316895
start getting new_q
get new_q:  0.06487870216369629
train start:
critic loss [0.0096392762, 0.0096033122]
critic loss [0.0096664745, 0.009634098]
critic loss [0.0094671808, 0.0094407592]
critic loss [0.0090772882, 0.0090580164]
critic loss [0.0085490411, 0.0085368864]
actor loss nan
train end 0.547508716583252
should be 1.0, predicted: [ 1.104846], target predicted: [ 1.03072143]
start searching new action
get new action:  0.23911523818969727
start getting new_q
get new_q:  0.06473398208618164
train start:
critic loss [0.0091360733, 0.0091360584]
critic loss [0.0087318635, 0.0087318365]
critic loss [0.008257689, 0.008257363]
critic loss [0.0077602873, 0.0077591632]
critic loss [0.0072724875, 0.0072699999]
actor loss nan
train end 0.5319547653198242
should be 1.0, predicted: [ 1.00273776], target predicted: [ 0.95504636]
start searching new action
get new action:  0.2345428466796875
start getting new_q
get new_q:  0.06164908409118652
train start:
cri

actor loss nan
train end 0.7523832321166992
start searching new action
get new action:  0.24690461158752441
start getting new_q
get new_q:  0.06941676139831543
train start:
critic loss [0.00635619, 0.0048505068]
critic loss [0.0056007602, 0.0045201634]
critic loss [0.0047389008, 0.004159444]
critic loss [0.004025057, 0.0038287695]
critic loss [0.0036099446, 0.0035658437]
actor loss nan
train end 0.5334115028381348
should be 1.0, predicted: [ 0.98161072], target predicted: [ 1.04792881]
should be 1.0, predicted: [ 0.74270844], target predicted: [ 0.94347137]
start searching new action
get new action:  0.23933792114257812
start getting new_q
get new_q:  0.060932159423828125
train start:
critic loss [0.0051653674, 0.0045988117]
critic loss [0.0054185009, 0.0047243941]
critic loss [0.0055223256, 0.0047668857]
critic loss [0.0054609887, 0.0047191023]
critic loss [0.0052533057, 0.0045916676]
actor loss nan
train end 0.5395858287811279
should be 1.0, predicted: [ 1.03476679], target predicted

get new action:  0.24114465713500977
start getting new_q
get new_q:  0.06466817855834961
train start:
critic loss [0.0069367047, 0.0064193374]
critic loss [0.0067625782, 0.0063563325]
critic loss [0.0065139248, 0.0062259082]
critic loss [0.0062250122, 0.0060446784]
critic loss [0.0059257206, 0.0058303168]
actor loss nan
train end 0.5337827205657959
should be 1.0, predicted: [ 1.03079438], target predicted: [ 1.11220539]
should be -1.0, predicted: [-0.88412923], target predicted: [-0.87899435]
start searching new action
get new action:  0.2506999969482422
start getting new_q
get new_q:  0.07344388961791992
train start:
critic loss [0.020052664, 0.0092254598]
critic loss [0.018906161, 0.00916145]
critic loss [0.016928766, 0.0090050073]
critic loss [0.014584263, 0.0087819658]
critic loss [0.012404108, 0.0085266773]
actor loss nan
train end 0.7337472438812256
should be -1.0, predicted: [-0.73154324], target predicted: [-0.528108]
should be -1.0, predicted: [-0.59474891], target predicted: 

get new action:  0.2683737277984619
start getting new_q
get new_q:  0.07525491714477539
train start:
critic loss [0.008937737, 0.0088749602]
critic loss [0.0084330691, 0.0083846785]
critic loss [0.0077427607, 0.007712821]
critic loss [0.0069891466, 0.0069750682]
critic loss [0.0062814504, 0.0062760613]
actor loss nan
train end 0.8552546501159668
should be 1.0, predicted: [ 0.8417505], target predicted: [ 0.97748464]
should be 1.0, predicted: [ 1.07609797], target predicted: [ 1.03457403]
start searching new action
get new action:  0.2403409481048584
start getting new_q
get new_q:  0.06544995307922363
train start:
critic loss [0.0084681306, 0.0084681306]
critic loss [0.0090111047, 0.0090111047]
critic loss [0.0092099607, 0.0092099607]
critic loss [0.0090506393, 0.0090506393]
critic loss [0.0085839918, 0.0085839918]
actor loss nan
train end 0.5978519916534424
start searching new action
get new action:  0.22875118255615234
start getting new_q
get new_q:  0.07413554191589355
train start:
c

get new action:  0.23791790008544922
start getting new_q
get new_q:  0.06051492691040039
train start:
critic loss [0.0062991688, 0.0062991688]
critic loss [0.0063242405, 0.0063242405]
critic loss [0.0060927672, 0.0060927672]
critic loss [0.005647067, 0.005647067]
critic loss [0.0050584874, 0.0050584874]
actor loss nan
train end 0.5182533264160156
start searching new action
get new action:  0.23372220993041992
start getting new_q
get new_q:  0.06323409080505371
train start:
critic loss [0.018284868, 0.017354257]
critic loss [0.016681692, 0.015780188]
critic loss [0.014545745, 0.013773739]
critic loss [0.012264402, 0.011682442]
critic loss [0.010174671, 0.0097941831]
actor loss nan
train end 0.5246231555938721
should be 1.0, predicted: [ 1.03700554], target predicted: [ 1.01290214]
should be 1.0, predicted: [ 1.03598928], target predicted: [ 1.02032912]
start searching new action
get new action:  0.23341083526611328
start getting new_q
get new_q:  0.06372666358947754
train start:
critic 

critic loss [0.0060231723, 0.0060213967]
critic loss [0.0063199783, 0.0063063852]
critic loss [0.0065564755, 0.0064967284]
critic loss [0.006652968, 0.0065393215]
actor loss nan
train end 0.5457382202148438
should be 1.0, predicted: [ 0.95903528], target predicted: [ 1.04695618]
start searching new action
get new action:  0.22431635856628418
start getting new_q
get new_q:  0.06450295448303223
train start:
critic loss [0.010346045, 0.0091153812]
critic loss [0.010143715, 0.0088352142]
critic loss [0.0095420722, 0.0083224596]
critic loss [0.0086965971, 0.0076902602]
critic loss [0.0077836886, 0.0070509501]
actor loss nan
train end 0.5160171985626221
should be -1.0, predicted: [-1.01282489], target predicted: [-0.86941612]
should be 1.0, predicted: [ 1.04043007], target predicted: [ 1.03217375]
should be 1.0, predicted: [ 0.96344471], target predicted: [ 1.01942289]
should be 1.0, predicted: [ 0.9841944], target predicted: [ 1.03847325]
start searching new action
get new action:  0.246938

critic loss [0.0046125222, 0.004343369]
actor loss nan
train end 0.517160177230835
should be 1.0, predicted: [ 1.03481817], target predicted: [ 1.0255847]
should be 1.0, predicted: [ 1.08037293], target predicted: [ 1.03637445]
should be 1.0, predicted: [ 0.98172063], target predicted: [ 1.02760649]
should be -1.0, predicted: [-0.96476132], target predicted: [-0.98712909]
start searching new action
get new action:  0.22238874435424805
start getting new_q
get new_q:  0.06534957885742188
train start:
critic loss [0.0067772311, 0.0067770518]
critic loss [0.0067762472, 0.0067760786]
critic loss [0.0066972133, 0.0066968109]
critic loss [0.0065407967, 0.0065397201]
critic loss [0.0063139978, 0.0063115684]
actor loss nan
train end 0.5157918930053711
should be 1.0, predicted: [ 1.12022507], target predicted: [ 1.08980596]
start searching new action
get new action:  0.2246849536895752
start getting new_q
get new_q:  0.07291507720947266
train start:
critic loss [0.004388819, 0.004387225]
critic 

critic loss [0.0049025714, 0.0045627491]
actor loss nan
train end 0.5274920463562012
should be 1.0, predicted: [ 0.98086065], target predicted: [ 0.98876601]
should be 1.0, predicted: [ 1.04232121], target predicted: [ 1.02360249]
should be 1.0, predicted: [ 1.08545697], target predicted: [ 1.10868919]
start searching new action
get new action:  0.23445606231689453
start getting new_q
get new_q:  0.07056379318237305
train start:
critic loss [0.0052308529, 0.0039486755]
critic loss [0.0053939838, 0.0041350927]
critic loss [0.0052363444, 0.0041751135]
critic loss [0.0048281047, 0.0040659411]
critic loss [0.0042936588, 0.0038361184]
actor loss nan
train end 0.5564415454864502
should be -1.0, predicted: [-1.10006261], target predicted: [-1.12071741]
should be 1.0, predicted: [ 0.99770319], target predicted: [ 0.99263614]
should be 1.0, predicted: [ 0.99500972], target predicted: [ 1.01941895]
should be -1.0, predicted: [-0.95944363], target predicted: [-0.79082805]
start searching new acti

critic loss [0.0045650518, 0.0045650518]
critic loss [0.0042315004, 0.0042315004]
critic loss [0.0039109215, 0.0039109215]
critic loss [0.0036258977, 0.0036258977]
actor loss nan
train end 0.5405035018920898
start searching new action
get new action:  0.22398781776428223
start getting new_q
get new_q:  0.06707334518432617
train start:
critic loss [0.004759172, 0.0047585694]
critic loss [0.0048239208, 0.0048230356]
critic loss [0.0048279185, 0.0048211813]
critic loss [0.0047623748, 0.0047454489]
critic loss [0.0046271882, 0.0045976215]
actor loss nan
train end 0.6408731937408447
should be 1.0, predicted: [ 1.0359695], target predicted: [ 1.03766406]
start searching new action
get new action:  0.24329733848571777
start getting new_q
get new_q:  0.07638978958129883
train start:
critic loss [0.0096519301, 0.0058335257]
critic loss [0.008785041, 0.0056160456]
critic loss [0.0074840467, 0.0053027435]
critic loss [0.0062008793, 0.0049639489]
critic loss [0.0052404548, 0.0046510203]
actor loss

get new action:  0.2287919521331787
start getting new_q
get new_q:  0.06479120254516602
train start:
critic loss [0.093103155, 0.017123934]
critic loss [0.077216193, 0.015970971]
critic loss [0.055086929, 0.014109408]
critic loss [0.034883823, 0.012461003]
critic loss [0.021281788, 0.011490867]
actor loss nan
train end 0.5332231521606445
should be 1.0, predicted: [ 0.86336023], target predicted: [ 0.5454495]
should be 1.0, predicted: [ 0.7161454], target predicted: [ 0.50354558]
should be 1.0, predicted: [ 0.94422436], target predicted: [ 1.01694012]
should be -1.0, predicted: [-0.93547606], target predicted: [ 0.18921848]
start searching new action
get new action:  0.22728753089904785
start getting new_q
get new_q:  0.07146072387695312
train start:
critic loss [0.023728862, 0.023728862]
critic loss [0.023317415, 0.023317415]
critic loss [0.02235062, 0.02235062]
critic loss [0.021094358, 0.021094358]
critic loss [0.019679232, 0.019679232]
actor loss nan
train end 0.6075656414031982
sta

critic loss [0.0070587569, 0.0070587569]
critic loss [0.00703455, 0.00703455]
critic loss [0.0069788937, 0.0069788937]
critic loss [0.0068859421, 0.0068859421]
actor loss nan
train end 0.5010726451873779
start searching new action
get new action:  0.2217116355895996
start getting new_q
get new_q:  0.06041598320007324
train start:
critic loss [0.052159924, 0.010367717]
critic loss [0.046324968, 0.0097377142]
critic loss [0.036595184, 0.0085693337]
critic loss [0.026283925, 0.0072880005]
critic loss [0.017006243, 0.0061391564]
actor loss nan
train end 0.5171291828155518
should be -1.0, predicted: [-1.07904196], target predicted: [-1.13313925]
should be 1.0, predicted: [ 0.98166138], target predicted: [ 1.01455963]
should be 1.0, predicted: [ 1.05172896], target predicted: [ 1.01798487]
should be 1.0, predicted: [ 1.05172896], target predicted: [ 1.01798487]
should be 1.0, predicted: [ 1.05172896], target predicted: [ 1.01798487]
should be -1.0, predicted: [-0.63098902], target predicted:

critic loss [0.014702291, 0.0094101913]
critic loss [0.01232991, 0.0086459005]
critic loss [0.0098927757, 0.007838198]
critic loss [0.0079164673, 0.0070947655]
actor loss nan
train end 0.6298458576202393
should be 1.0, predicted: [ 0.80425733], target predicted: [ 0.66666389]
start searching new action
get new action:  0.25472140312194824
start getting new_q
get new_q:  0.06705689430236816
train start:
critic loss [0.0056839036, 0.004450921]
critic loss [0.005521548, 0.0044391179]
critic loss [0.0053098961, 0.0044572949]
critic loss [0.0050879661, 0.004491339]
critic loss [0.0048866114, 0.004526577]
actor loss nan
train end 0.5730597972869873
should be 1.0, predicted: [ 1.04052234], target predicted: [ 1.09907722]
should be 1.0, predicted: [ 1.05533934], target predicted: [ 1.03053629]
start searching new action
get new action:  0.22269821166992188
start getting new_q
get new_q:  0.06745076179504395
train start:
critic loss [0.010814948, 0.010790278]
critic loss [0.010867069, 0.0108611

get new action:  0.2206554412841797
start getting new_q
get new_q:  0.06285572052001953
train start:
critic loss [0.0087130032, 0.0066303462]
critic loss [0.0084084282, 0.0064635063]
critic loss [0.0075420989, 0.0061289524]
critic loss [0.0064561549, 0.0057041007]
critic loss [0.0055082981, 0.0052708136]
actor loss nan
train end 0.5121850967407227
should be 1.0, predicted: [ 1.06451333], target predicted: [ 0.99127805]
should be 1.0, predicted: [ 1.06850171], target predicted: [ 1.01392341]
should be 1.0, predicted: [ 1.03098595], target predicted: [ 0.99902707]
should be 1.0, predicted: [ 1.14937019], target predicted: [ 1.01858902]
start searching new action
get new action:  0.2393946647644043
start getting new_q
get new_q:  0.07020401954650879
train start:
critic loss [0.0076345978, 0.0076345978]
critic loss [0.0072720237, 0.0072720237]
critic loss [0.0069442503, 0.0069442503]
critic loss [0.0066529629, 0.0066529629]
critic loss [0.0063962112, 0.0063962112]
actor loss nan
train end 

critic loss [0.011860216, 0.010564119]
critic loss [0.011455359, 0.010413993]
critic loss [0.010646627, 0.0098832641]
critic loss [0.0096569294, 0.0091128377]
actor loss nan
train end 0.5603010654449463
should be 1.0, predicted: [ 1.12074673], target predicted: [ 0.98564255]
should be 1.0, predicted: [ 1.10477412], target predicted: [ 0.99501449]
should be 1.0, predicted: [ 0.92597896], target predicted: [ 1.00073743]
start searching new action
get new action:  0.24483275413513184
start getting new_q
get new_q:  0.06768417358398438
train start:
critic loss [0.0082621183, 0.0082518067]
critic loss [0.0079033729, 0.007900985]
critic loss [0.0075937579, 0.0075601782]
critic loss [0.0073108231, 0.0072233668]
critic loss [0.0070332782, 0.0068867421]
actor loss nan
train end 0.528376579284668
should be 1.0, predicted: [ 1.0023762], target predicted: [ 1.00152278]
start searching new action
get new action:  0.2334749698638916
start getting new_q
get new_q:  0.07532072067260742
train start:
cr

critic loss [0.023218043, 0.010353946]
critic loss [0.020127779, 0.0098856594]
critic loss [0.016746061, 0.0094354972]
critic loss [0.013648048, 0.0090658478]
actor loss nan
train end 0.5018856525421143
should be 1.0, predicted: [ 0.99638987], target predicted: [ 0.99986356]
should be 1.0, predicted: [ 1.08864236], target predicted: [ 1.12132573]
should be 1.0, predicted: [ 0.89151758], target predicted: [ 0.88649565]
start searching new action
get new action:  0.22804975509643555
start getting new_q
get new_q:  0.06390762329101562
train start:
critic loss [0.0058689225, 0.0057991268]
critic loss [0.0056766998, 0.0056219003]
critic loss [0.0053878962, 0.0053478414]
critic loss [0.0050251372, 0.0049980418]
critic loss [0.0046185604, 0.004601826]
actor loss nan
train end 0.534888505935669
should be 1.0, predicted: [ 1.00554037], target predicted: [ 1.07258904]
start searching new action
get new action:  0.2236948013305664
start getting new_q
get new_q:  0.0614781379699707
train start:
cr

get new action:  0.25989627838134766
start getting new_q
get new_q:  0.0756826400756836
train start:
critic loss [0.0039845956, 0.0036647101]
critic loss [0.0039794939, 0.0036978719]
critic loss [0.0040094014, 0.0037745291]
critic loss [0.0040344186, 0.0038479869]
critic loss [0.0040223747, 0.0038797825]
actor loss nan
train end 0.6043498516082764
should be 1.0, predicted: [ 0.93564349], target predicted: [ 0.97077692]
should be 1.0, predicted: [ 0.91612625], target predicted: [ 0.99439305]
start searching new action
get new action:  0.23754525184631348
start getting new_q
get new_q:  0.07045769691467285
train start:
critic loss [0.0088142511, 0.0065273172]
critic loss [0.0080626523, 0.0063418457]
critic loss [0.0070603518, 0.0060432861]
critic loss [0.0061177621, 0.0057006143]
critic loss [0.0054368023, 0.0053670714]
actor loss nan
train end 0.6255693435668945
should be 1.0, predicted: [ 0.96060938], target predicted: [ 1.10488069]
start searching new action
get new action:  0.2286615

critic loss [0.0053559737, 0.0047269883]
critic loss [0.0051918337, 0.0046335952]
critic loss [0.0050065825, 0.0045453515]
critic loss [0.0048062941, 0.0044509396]
actor loss nan
train end 0.5924239158630371
should be 1.0, predicted: [ 1.01375782], target predicted: [ 1.01161468]
start searching new action
get new action:  0.22562170028686523
start getting new_q
get new_q:  0.0665884017944336
train start:
critic loss [0.0059939222, 0.0034991426]
critic loss [0.0056086173, 0.003413734]
critic loss [0.0048902375, 0.0032164352]
critic loss [0.0040361555, 0.0029586039]
critic loss [0.0032426887, 0.0026947651]
actor loss nan
train end 0.5111076831817627
should be -1.0, predicted: [-1.0473423], target predicted: [-1.21009171]
should be 1.0, predicted: [ 0.98795229], target predicted: [ 1.06396389]
start searching new action
get new action:  0.22785592079162598
start getting new_q
get new_q:  0.06374454498291016
train start:
critic loss [0.088496849, 0.012396216]
critic loss [0.074696131, 0.0

critic loss [0.009598121, 0.0092278151]
critic loss [0.0092232237, 0.0088612307]
critic loss [0.008581765, 0.0082467217]
critic loss [0.0077779214, 0.0074845064]
actor loss nan
train end 0.530602216720581
should be 1.0, predicted: [ 1.10088181], target predicted: [ 1.04034841]
start searching new action
get new action:  0.22531342506408691
start getting new_q
get new_q:  0.06660962104797363
train start:
critic loss [0.010983177, 0.0083230194]
critic loss [0.010129607, 0.007819307]
critic loss [0.0091039203, 0.0073274709]
critic loss [0.0080911033, 0.0069051255]
critic loss [0.0072428542, 0.0065800087]
actor loss nan
train end 0.6620237827301025
should be 1.0, predicted: [ 0.88417023], target predicted: [ 0.86842179]
trained action prob map predicted by initial model for a starting game
[[[ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan  nan  nan  nan  nan  nan  nan]
  [ nan 

critic loss [0.015131282, 0.015131282]
critic loss [0.014133863, 0.014133863]
critic loss [0.012833897, 0.012833897]
critic loss [0.011444713, 0.011444713]
actor loss nan
train end 0.5113685131072998
start searching new action
get new action:  0.23148775100708008
start getting new_q
get new_q:  0.06482863426208496
train start:
critic loss [0.0056932494, 0.0051545682]
critic loss [0.0055669802, 0.0051716268]
critic loss [0.0054834178, 0.0052277669]
critic loss [0.0054124338, 0.0052729188]
critic loss [0.0053217388, 0.0052645653]
actor loss nan
train end 0.5176520347595215
should be 1.0, predicted: [ 1.01844668], target predicted: [ 1.01694405]
should be 1.0, predicted: [ 1.04320586], target predicted: [ 1.02265882]
start searching new action
get new action:  0.21889638900756836
start getting new_q
get new_q:  0.06566834449768066
train start:
critic loss [0.0069590844, 0.0065349741]
critic loss [0.0070105609, 0.006482149]
critic loss [0.0066535082, 0.0061335461]
critic loss [0.00598688, 

critic loss [0.0037179373, 0.0037179356]
actor loss nan
train end 0.5574419498443604
should be 1.0, predicted: [ 0.98751175], target predicted: [ 1.0289706]
start searching new action
get new action:  0.23593544960021973
start getting new_q
get new_q:  0.0688791275024414
train start:
critic loss [0.0046817753, 0.0046817753]
critic loss [0.0046242746, 0.0046242746]
critic loss [0.0045453305, 0.0045453305]
critic loss [0.004450412, 0.004450412]
critic loss [0.0043429006, 0.0043429006]
actor loss nan
train end 0.7329249382019043
start searching new action
get new action:  0.23342156410217285
start getting new_q
get new_q:  0.07219457626342773
train start:
critic loss [0.0045693894, 0.0039317897]
critic loss [0.0046615019, 0.0040440112]
critic loss [0.0045324285, 0.004052707]
critic loss [0.00424287, 0.0039547272]
critic loss [0.0038914019, 0.0037735221]
actor loss nan
train end 0.6647157669067383
should be 1.0, predicted: [ 0.97924709], target predicted: [ 1.04897702]
should be 1.0, predi

critic loss [0.0080191549, 0.0077773202]
actor loss nan
train end 0.5120649337768555
should be 1.0, predicted: [ 1.07643533], target predicted: [ 1.00886941]
should be 1.0, predicted: [ 1.0002358], target predicted: [ 1.02873611]
should be 1.0, predicted: [ 0.94605243], target predicted: [ 0.99386722]
should be 1.0, predicted: [ 1.03392148], target predicted: [ 1.02874386]
should be 1.0, predicted: [ 1.0002358], target predicted: [ 1.02873611]
start searching new action
get new action:  0.21850323677062988
start getting new_q
get new_q:  0.06593489646911621
train start:
critic loss [0.0066904062, 0.0066876048]
critic loss [0.0063380697, 0.0063271001]
critic loss [0.0060426723, 0.0060200072]
critic loss [0.005802053, 0.0057664178]
critic loss [0.0056064785, 0.0055588051]
actor loss nan
train end 0.5182688236236572
should be 1.0, predicted: [ 1.00852585], target predicted: [ 1.04163051]
start searching new action
get new action:  0.21369314193725586
start getting new_q
get new_q:  0.0634

critic loss [0.0043066395, 0.0041314056]
critic loss [0.0040824972, 0.0039719259]
critic loss [0.0038662215, 0.0038089184]
critic loss [0.003662785, 0.0036417474]
actor loss nan
train end 0.552253246307373
should be 1.0, predicted: [ 0.95218837], target predicted: [ 1.03049362]
start searching new action
get new action:  0.22870087623596191
start getting new_q
get new_q:  0.06855940818786621
train start:
critic loss [0.0057675913, 0.0051283021]
critic loss [0.0056728469, 0.0050001917]
critic loss [0.0053511262, 0.0047261668]
critic loss [0.0048795976, 0.004363657]
critic loss [0.0043508131, 0.0039765816]
actor loss nan
train end 0.5627176761627197
should be 1.0, predicted: [ 0.95531189], target predicted: [ 1.00630033]
should be 1.0, predicted: [ 0.97671527], target predicted: [ 1.02569878]
start searching new action
get new action:  0.22554731369018555
start getting new_q
get new_q:  0.07497048377990723
train start:
critic loss [0.0039261407, 0.0029774972]
critic loss [0.0036950412, 0

critic loss [0.0036288202, 0.0034113172]
actor loss nan
train end 0.5039749145507812
should be 1.0, predicted: [ 1.04724765], target predicted: [ 0.99756241]
should be 1.0, predicted: [ 0.98539823], target predicted: [ 1.03132486]
start searching new action
get new action:  0.2251269817352295
start getting new_q
get new_q:  0.0665283203125
train start:
critic loss [0.0034819993, 0.0026210027]
critic loss [0.0032891063, 0.0026008778]
critic loss [0.0030854011, 0.0025669821]
critic loss [0.0029045411, 0.0025262507]
critic loss [0.0027590012, 0.0024825179]
actor loss nan
train end 0.5285277366638184
should be 1.0, predicted: [ 1.04037821], target predicted: [ 1.02033317]
should be -1.0, predicted: [-0.99854189], target predicted: [-1.04920506]
start searching new action
get new action:  0.2378227710723877
start getting new_q
get new_q:  0.06403994560241699
train start:
critic loss [0.0040120226, 0.0039456268]
critic loss [0.0040428862, 0.0039946381]
critic loss [0.0039745923, 0.0039419741

critic loss [0.0028392528, 0.0022696056]
actor loss nan
train end 0.51308274269104
should be 1.0, predicted: [ 1.0900147], target predicted: [ 1.12376451]
should be 1.0, predicted: [ 0.93552697], target predicted: [ 1.02730262]
start searching new action
get new action:  0.2267143726348877
start getting new_q
get new_q:  0.06612157821655273
train start:
critic loss [0.0045471182, 0.0044402382]
critic loss [0.0043935217, 0.0043142089]
critic loss [0.0042277034, 0.0041682506]
critic loss [0.0040590498, 0.0040113274]
critic loss [0.0038951887, 0.0038523269]
actor loss nan
train end 0.5151629447937012
should be 1.0, predicted: [ 0.97345001], target predicted: [ 1.02685773]
should be 1.0, predicted: [ 1.00294578], target predicted: [ 1.03287709]
start searching new action
get new action:  0.22641515731811523
start getting new_q
get new_q:  0.06387639045715332
train start:
critic loss [0.0038483322, 0.0038471301]
critic loss [0.0037275825, 0.0037252675]
critic loss [0.0035995713, 0.003596333

critic loss [0.0047585312, 0.0047403816]
critic loss [0.0045008082, 0.0044775503]
critic loss [0.0041874917, 0.0041611865]
critic loss [0.0038623, 0.0038353968]
actor loss nan
train end 0.6815664768218994
should be 1.0, predicted: [ 0.95726216], target predicted: [ 1.01312029]
start searching new action
get new action:  0.23348379135131836
start getting new_q
get new_q:  0.07091832160949707
train start:
critic loss [0.011390276, 0.0095768422]
critic loss [0.010096929, 0.0086832736]
critic loss [0.0086011086, 0.0076542562]
critic loss [0.0072676088, 0.0066673351]
critic loss [0.006292148, 0.0058327317]
actor loss nan
train end 0.9720718860626221
should be 1.0, predicted: [ 0.99918467], target predicted: [ 0.96550202]
should be 1.0, predicted: [ 1.11919296], target predicted: [ 1.0054661]
should be 1.0, predicted: [ 1.09559596], target predicted: [ 1.02883887]
should be 1.0, predicted: [ 1.01001227], target predicted: [ 1.01533723]
start searching new action
get new action:  0.2314279079

critic loss [0.0069107399, 0.004543365]
critic loss [0.0069305394, 0.0046846331]
critic loss [0.0066966824, 0.0048171906]
critic loss [0.0062530567, 0.0048637586]
actor loss nan
train end 0.6034374237060547
should be 1.0, predicted: [ 0.93736839], target predicted: [ 1.00897682]
should be 1.0, predicted: [ 1.01936805], target predicted: [ 1.0716368]
should be 1.0, predicted: [ 0.97109729], target predicted: [ 1.03540432]
should be 1.0, predicted: [ 1.07918382], target predicted: [ 1.13564074]
should be 1.0, predicted: [ 0.92990369], target predicted: [ 1.01401997]
start searching new action
get new action:  0.2698781490325928
start getting new_q
get new_q:  0.07516670227050781
train start:
critic loss [0.0042437091, 0.0040309252]
critic loss [0.0039929571, 0.0037865925]
critic loss [0.0036556763, 0.0034378734]
critic loss [0.0032777123, 0.0030322333]
critic loss [0.0029034219, 0.0026208654]
actor loss nan
train end 0.6458652019500732
should be 1.0, predicted: [ 1.11734748], target pred

critic loss [0.0084818918, 0.0033226886]
critic loss [0.0042807041, 0.0030199785]
actor loss nan
train end 0.6283845901489258
should be -1.0, predicted: [-0.9501096], target predicted: [-0.02643309]
should be 1.0, predicted: [ 1.02914715], target predicted: [ 1.01581669]
should be 1.0, predicted: [ 1.08577752], target predicted: [ 1.04788446]
should be 1.0, predicted: [ 1.01773286], target predicted: [ 1.01862204]
should be 1.0, predicted: [ 1.12210536], target predicted: [ 1.02110529]
start searching new action
get new action:  0.23638415336608887
start getting new_q
get new_q:  0.07251715660095215
train start:
critic loss [0.0050396328, 0.0047497111]
critic loss [0.0052698385, 0.0049514165]
critic loss [0.005379254, 0.0050819367]
critic loss [0.0053912085, 0.0051498073]
critic loss [0.0053356634, 0.0051675206]
actor loss nan
train end 0.5605118274688721
should be 1.0, predicted: [ 1.03476357], target predicted: [ 1.0399189]
should be 1.0, predicted: [ 1.00485456], target predicted: [

critic loss [0.0041249478, 0.0041249478]
critic loss [0.0040128455, 0.0040128455]
critic loss [0.0038919719, 0.0038919719]
critic loss [0.0037657754, 0.0037657754]
actor loss nan
train end 0.5973095893859863
start searching new action
get new action:  0.22223329544067383
start getting new_q
get new_q:  0.06060290336608887
train start:
critic loss [0.006596949, 0.0050877407]
critic loss [0.006371099, 0.0049984134]
critic loss [0.0059799361, 0.00485365]
critic loss [0.005502033, 0.0046703019]
critic loss [0.0050183525, 0.0044670217]
actor loss nan
train end 0.5663747787475586
should be 1.0, predicted: [ 0.86274564], target predicted: [ 1.00406873]
should be 1.0, predicted: [ 0.97150666], target predicted: [ 1.004614]
should be 1.0, predicted: [ 0.87023562], target predicted: [ 0.74009722]
start searching new action
get new action:  0.22482562065124512
start getting new_q
get new_q:  0.0771334171295166
train start:
critic loss [0.0047935736, 0.004752514]
critic loss [0.0046452624, 0.00462

critic loss [0.0038533215, 0.0034542102]
actor loss nan
train end 0.5904905796051025
should be 1.0, predicted: [ 1.03460932], target predicted: [ 1.02898693]
should be 1.0, predicted: [ 1.03460932], target predicted: [ 1.02898693]
start searching new action
get new action:  0.2219705581665039
start getting new_q
get new_q:  0.06646728515625
train start:
critic loss [0.0080973739, 0.0055777747]
critic loss [0.0076077888, 0.0055521186]
critic loss [0.0069253221, 0.0054162797]
critic loss [0.0061658937, 0.0051832739]
critic loss [0.0054289373, 0.0048745712]
actor loss nan
train end 0.5347187519073486
should be 1.0, predicted: [ 1.06789875], target predicted: [ 1.00349927]
should be 1.0, predicted: [ 0.92123133], target predicted: [ 0.89496595]
start searching new action
get new action:  0.22188830375671387
start getting new_q
get new_q:  0.06070065498352051
train start:
critic loss [0.0054001589, 0.0051019243]
critic loss [0.0051791412, 0.0048538521]
critic loss [0.0048313309, 0.004514615

critic loss [0.0046395329, 0.0046395329]
critic loss [0.0039899321, 0.0039899321]
critic loss [0.003443406, 0.003443406]
critic loss [0.0030196826, 0.0030196826]
actor loss nan
train end 0.5687150955200195
start searching new action
get new action:  0.23444914817810059
start getting new_q
get new_q:  0.07218313217163086
train start:
critic loss [0.0047877771, 0.0047877771]
critic loss [0.0046329293, 0.0046329293]
critic loss [0.0044813841, 0.0044813841]
critic loss [0.0043259654, 0.0043259654]
critic loss [0.0041661211, 0.0041661211]
actor loss nan
train end 0.5553913116455078
Episode : 1094 Replay Buffer 50000
TOTAL REWARD @ 1094-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2665529251098633
start getting new_q
get new_q:  0.07173728942871094
train start:
critic loss [0.0070776455, 0.0070776455]
critic loss [0.0069090631, 0.0069090631]
critic loss [0.0064055952, 0.0064055952]
critic loss [0.0056943996, 0.0056943996]
critic loss [0.0049037603, 0.0

critic loss [0.0044772597, 0.0042019067]
critic loss [0.0043023182, 0.0040693567]
critic loss [0.0040982757, 0.0039133532]
critic loss [0.0038839567, 0.0037466181]
actor loss nan
train end 0.6811444759368896
should be 1.0, predicted: [ 1.03716969], target predicted: [ 1.01765323]
start searching new action
get new action:  0.23079228401184082
start getting new_q
get new_q:  0.06550002098083496
train start:
critic loss [0.011554599, 0.011554599]
critic loss [0.011436542, 0.011436542]
critic loss [0.010740858, 0.010740858]
critic loss [0.0096263131, 0.0096263131]
critic loss [0.0082812775, 0.0082812775]
actor loss nan
train end 0.5614316463470459
start searching new action
get new action:  0.23239898681640625
start getting new_q
get new_q:  0.06384849548339844
train start:
critic loss [0.017418213, 0.016499564]
critic loss [0.014764538, 0.013979351]
critic loss [0.011957665, 0.01136631]
critic loss [0.0093797911, 0.0089975465]
critic loss [0.00730896, 0.0071112849]
actor loss nan
train e

critic loss [0.0058395406, 0.0057998002]
actor loss nan
train end 0.5209085941314697
should be 1.0, predicted: [ 1.18248439], target predicted: [ 1.03000391]
start searching new action
get new action:  0.220261812210083
start getting new_q
get new_q:  0.06587028503417969
train start:
critic loss [0.0043587564, 0.0043587564]
critic loss [0.0043624341, 0.0043624341]
critic loss [0.0044458853, 0.0044458853]
critic loss [0.0044867564, 0.0044867564]
critic loss [0.0044025043, 0.0044025043]
actor loss nan
train end 0.5380799770355225
start searching new action
get new action:  0.2247321605682373
start getting new_q
get new_q:  0.06423282623291016
train start:
critic loss [0.0077719591, 0.0077719591]
critic loss [0.0076424517, 0.0076424517]
critic loss [0.0071412744, 0.0071412744]
critic loss [0.0063994094, 0.0063994094]
critic loss [0.005569255, 0.005569255]
actor loss nan
train end 0.5355238914489746
start searching new action
get new action:  0.2229595184326172
start getting new_q
get new_

critic loss [0.018093951, 0.0042195469]
critic loss [0.0095812054, 0.0033811568]
critic loss [0.0046457807, 0.0028355308]
critic loss [0.0029472187, 0.0025992119]
actor loss nan
train end 0.5341370105743408
should be 1.0, predicted: [ 1.0319258], target predicted: [ 0.35192135]
start searching new action
get new action:  0.22518086433410645
start getting new_q
get new_q:  0.06450581550598145
train start:
critic loss [0.0051819673, 0.0050651105]
critic loss [0.0048342659, 0.0047553834]
critic loss [0.0044500409, 0.0044079726]
critic loss [0.0040798131, 0.0040639006]
critic loss [0.0037578293, 0.0037549878]
actor loss nan
train end 0.54901123046875
should be 1.0, predicted: [ 0.93711978], target predicted: [ 0.98375976]
start searching new action
get new action:  0.21346020698547363
start getting new_q
get new_q:  0.06826949119567871
train start:
critic loss [0.0055066445, 0.0054519502]
critic loss [0.00589965, 0.0058479682]
critic loss [0.0060063703, 0.0059653269]
critic loss [0.0058445

critic loss [0.0077685718, 0.0070633269]
critic loss [0.0068171974, 0.0062357849]
critic loss [0.0057869339, 0.0053619421]
critic loss [0.0048472276, 0.0045764982]
actor loss nan
train end 0.6719815731048584
should be 1.0, predicted: [ 0.93090689], target predicted: [ 1.04099643]
should be 1.0, predicted: [ 0.91827196], target predicted: [ 1.03592896]
should be 1.0, predicted: [ 0.6762318], target predicted: [ 0.55272973]
start searching new action
get new action:  0.22295498847961426
start getting new_q
get new_q:  0.07213950157165527
train start:
critic loss [0.0039682249, 0.0039503146]
critic loss [0.0046354635, 0.0046348497]
critic loss [0.0050355094, 0.0050315959]
critic loss [0.0050972593, 0.0050825253]
critic loss [0.0048343213, 0.0048123198]
actor loss nan
train end 0.6291904449462891
should be 1.0, predicted: [ 0.98515385], target predicted: [ 1.02426088]
should be 1.0, predicted: [ 0.98515385], target predicted: [ 1.02426088]
start searching new action
get new action:  0.2284

get new action:  0.22025299072265625
start getting new_q
get new_q:  0.06563663482666016
train start:
critic loss [0.0016116727, 0.0016116727]
critic loss [0.0017002984, 0.0017002984]
critic loss [0.0017480979, 0.0017480979]
critic loss [0.0017465882, 0.0017465882]
critic loss [0.0016959566, 0.0016959566]
actor loss nan
train end 0.5144646167755127
start searching new action
get new action:  0.2201838493347168
start getting new_q
get new_q:  0.07671809196472168
train start:
critic loss [0.0020087073, 0.0020087073]
critic loss [0.0019465042, 0.0019465042]
critic loss [0.0018253303, 0.0018253303]
critic loss [0.0016643375, 0.0016643375]
critic loss [0.0014858195, 0.0014858195]
actor loss nan
train end 0.5723040103912354
start searching new action
get new action:  0.2604184150695801
start getting new_q
get new_q:  0.06627225875854492
train start:
critic loss [0.0032610819, 0.0032475647]
critic loss [0.0029145479, 0.002903128]
critic loss [0.0025473279, 0.0025384985]
critic loss [0.0021985

critic loss [0.0053540338, 0.0051993537]
actor loss nan
train end 0.5505852699279785
should be 1.0, predicted: [ 0.9716664], target predicted: [ 1.01166022]
should be 1.0, predicted: [ 0.9716664], target predicted: [ 1.01166022]
start searching new action
get new action:  0.23001456260681152
start getting new_q
get new_q:  0.06964540481567383
train start:
critic loss [0.016186772, 0.014982228]
critic loss [0.015215247, 0.01420445]
critic loss [0.013708225, 0.012943244]
critic loss [0.011964552, 0.011448884]
critic loss [0.010248537, 0.0099487994]
actor loss nan
train end 0.5625317096710205
should be 1.0, predicted: [ 1.04824269], target predicted: [ 1.04139435]
should be 1.0, predicted: [ 1.07717729], target predicted: [ 1.03060997]
should be 1.0, predicted: [ 1.08109832], target predicted: [ 1.0125978]
start searching new action
get new action:  0.2319340705871582
start getting new_q
get new_q:  0.0625917911529541
train start:
critic loss [0.0034270268, 0.003312693]
critic loss [0.004

get new action:  0.22810959815979004
start getting new_q
get new_q:  0.06873130798339844
train start:
critic loss [0.0017598153, 0.0017598153]
critic loss [0.0018317874, 0.0018317874]
critic loss [0.0018340138, 0.0018340138]
critic loss [0.001766377, 0.001766377]
critic loss [0.0016423182, 0.0016423182]
actor loss nan
train end 0.5424461364746094
start searching new action
get new action:  0.22000980377197266
start getting new_q
get new_q:  0.05941939353942871
train start:
critic loss [0.0054128189, 0.0043493183]
critic loss [0.0048918882, 0.0040228567]
critic loss [0.0042483825, 0.0036316421]
critic loss [0.0036197477, 0.0032379138]
critic loss [0.0031078383, 0.0028894669]
actor loss nan
train end 0.5283999443054199
should be 1.0, predicted: [ 0.96428555], target predicted: [ 1.03406715]
should be 1.0, predicted: [ 0.90525085], target predicted: [ 1.03201175]
should be 1.0, predicted: [ 0.92355663], target predicted: [ 1.02062774]
start searching new action
get new action:  0.21239352

critic loss [0.0027471469, 0.0027452232]
critic loss [0.0029113407, 0.0029109495]
critic loss [0.0030114665, 0.0030114488]
critic loss [0.003024389, 0.0030237071]
actor loss nan
train end 0.5816144943237305
should be 1.0, predicted: [ 1.06057107], target predicted: [ 1.02747273]
start searching new action
get new action:  0.22100448608398438
start getting new_q
get new_q:  0.06253695487976074
train start:
critic loss [0.0044330596, 0.004218949]
critic loss [0.0041924352, 0.0040095723]
critic loss [0.0037140958, 0.0036015245]
critic loss [0.0031203348, 0.0030797455]
critic loss [0.0025353811, 0.0025334735]
actor loss nan
train end 0.518890380859375
should be -1.0, predicted: [-0.92176491], target predicted: [-0.9080978]
start searching new action
get new action:  0.22449040412902832
start getting new_q
get new_q:  0.06766748428344727
train start:
critic loss [0.01059721, 0.0098537616]
critic loss [0.010514379, 0.0098940358]
critic loss [0.0099617615, 0.0095376633]
critic loss [0.0091254

critic loss [0.0073824665, 0.0070116934]
critic loss [0.0079207867, 0.0075450162]
critic loss [0.0078251567, 0.0075237327]
critic loss [0.0072101047, 0.0070245974]
actor loss nan
train end 0.5033483505249023
should be 1.0, predicted: [ 1.03617024], target predicted: [ 1.00750756]
should be 1.0, predicted: [ 1.03617024], target predicted: [ 1.00750756]
start searching new action
get new action:  0.2080984115600586
start getting new_q
get new_q:  0.06173300743103027
train start:
critic loss [0.0047926214, 0.0047926214]
critic loss [0.0044290517, 0.0044290517]
critic loss [0.0040870477, 0.0040870477]
critic loss [0.0038042164, 0.0038042164]
critic loss [0.0035924995, 0.0035924995]
actor loss nan
train end 0.5186188220977783
start searching new action
get new action:  0.21918153762817383
start getting new_q
get new_q:  0.05995297431945801
train start:
critic loss [0.0033009253, 0.0033009253]
critic loss [0.0030440008, 0.0030440008]
critic loss [0.0028289205, 0.0028289205]
critic loss [0.00

get new action:  0.2165982723236084
start getting new_q
get new_q:  0.06242632865905762
train start:
critic loss [0.0018866879, 0.0018161632]
critic loss [0.0018800731, 0.0018112911]
critic loss [0.0018025618, 0.0017440878]
critic loss [0.001669646, 0.0016267821]
critic loss [0.00150516, 0.0014791446]
actor loss nan
train end 0.5146276950836182
should be 1.0, predicted: [ 1.03308368], target predicted: [ 1.00921547]
start searching new action
get new action:  0.22082138061523438
start getting new_q
get new_q:  0.06430363655090332
train start:
critic loss [0.00081809022, 0.00081809022]
critic loss [0.00089266128, 0.00089266128]
critic loss [0.00093126635, 0.00093126635]
critic loss [0.00092744594, 0.00092744594]
critic loss [0.00088320114, 0.00088320114]
actor loss nan
train end 0.5098843574523926
start searching new action
get new action:  0.21350646018981934
start getting new_q
get new_q:  0.06353139877319336
train start:
critic loss [0.0023252093, 0.0023252093]
critic loss [0.0021818

get new action:  0.22541069984436035
start getting new_q
get new_q:  0.07556486129760742
train start:
critic loss [0.015320378, 0.014598575]
critic loss [0.014233948, 0.013501899]
critic loss [0.012769174, 0.012078565]
critic loss [0.011188833, 0.010582346]
critic loss [0.0097210733, 0.0092252614]
actor loss nan
train end 0.672631025314331
should be 1.0, predicted: [ 0.98750067], target predicted: [ 1.02684295]
start searching new action
get new action:  0.2317492961883545
start getting new_q
get new_q:  0.07631397247314453
train start:
critic loss [0.0073454734, 0.0071233958]
critic loss [0.0085837562, 0.0083431322]
critic loss [0.009133704, 0.0088988459]
critic loss [0.0089692622, 0.0087628365]
critic loss [0.0082137519, 0.0080505563]
actor loss nan
train end 0.6793637275695801
should be 1.0, predicted: [ 0.97532195], target predicted: [ 1.00234342]
Episode : 1117 Replay Buffer 50000
TOTAL REWARD @ 1117-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:

get new action:  0.22040462493896484
start getting new_q
get new_q:  0.06645679473876953
train start:
critic loss [0.0044239927, 0.0041751936]
critic loss [0.0039733378, 0.0038026113]
critic loss [0.0035532778, 0.0034524268]
critic loss [0.0032099644, 0.0031597512]
critic loss [0.0029632817, 0.0029402163]
actor loss nan
train end 0.5391573905944824
should be 1.0, predicted: [ 0.95906675], target predicted: [ 1.04419756]
should be 1.0, predicted: [ 0.94040924], target predicted: [ 1.01360822]
start searching new action
get new action:  0.23937106132507324
start getting new_q
get new_q:  0.06978297233581543
train start:
critic loss [0.0046457956, 0.0046457956]
critic loss [0.0044558113, 0.0044558113]
critic loss [0.0042708642, 0.0042708642]
critic loss [0.0040901192, 0.0040901192]
critic loss [0.00391063, 0.00391063]
actor loss nan
train end 0.6289451122283936
start searching new action
get new action:  0.23517799377441406
start getting new_q
get new_q:  0.06567144393920898
train start:


Episode : 1121 Replay Buffer 50000
TOTAL REWARD @ 1121-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2662355899810791
start getting new_q
get new_q:  0.07115483283996582
train start:
critic loss [0.0019795196, 0.0019795196]
critic loss [0.0021808022, 0.0021808022]
critic loss [0.0023272811, 0.0023272811]
critic loss [0.0023775147, 0.0023775147]
critic loss [0.00231667, 0.00231667]
actor loss nan
train end 0.9597809314727783
start searching new action
get new action:  0.23720073699951172
start getting new_q
get new_q:  0.08191895484924316
train start:
critic loss [0.0070072222, 0.0068891803]
critic loss [0.0068284329, 0.0067562172]
critic loss [0.0064980621, 0.0064631128]
critic loss [0.0060761711, 0.0060654962]
critic loss [0.0056235297, 0.0056230137]
actor loss nan
train end 0.8043005466461182
should be 1.0, predicted: [ 0.87402856], target predicted: [ 1.01921332]
start searching new action
get new action:  0.21710515022277832
start getting new_

critic loss [0.0020210207, 0.0020210207]
critic loss [0.0020417934, 0.0020417934]
critic loss [0.0020423154, 0.0020423154]
actor loss nan
train end 0.941669225692749
start searching new action
get new action:  0.23274827003479004
start getting new_q
get new_q:  0.0655829906463623
train start:
critic loss [0.0039551202, 0.0038340769]
critic loss [0.0037481824, 0.0036439584]
critic loss [0.0033032643, 0.0032221586]
critic loss [0.0027151222, 0.0026588796]
critic loss [0.0020858608, 0.0020523644]
actor loss nan
train end 0.576852560043335
should be 1.0, predicted: [ 1.08140981], target predicted: [ 1.04477167]
start searching new action
get new action:  0.23450994491577148
start getting new_q
get new_q:  0.07687854766845703
train start:
critic loss [0.027606275, 0.0045208251]
critic loss [0.021069439, 0.0041453778]
critic loss [0.012887402, 0.0036967006]
critic loss [0.0071830484, 0.0035978514]
critic loss [0.0046003368, 0.0038362383]
actor loss nan
train end 0.6311595439910889
should be 

critic loss [0.0047434699, 0.0044642864]
critic loss [0.0040765456, 0.0038531662]
critic loss [0.0033231534, 0.0031619137]
critic loss [0.00260618, 0.0025047106]
actor loss nan
train end 0.7210519313812256
should be 1.0, predicted: [ 1.01045418], target predicted: [ 1.03465319]
start searching new action
get new action:  0.21623778343200684
start getting new_q
get new_q:  0.0668497085571289
train start:
critic loss [0.00305859, 0.0030369132]
critic loss [0.0033179878, 0.0032952905]
critic loss [0.0033754427, 0.0033554765]
critic loss [0.0032423579, 0.0032273645]
critic loss [0.0029783659, 0.0029690629]
actor loss nan
train end 0.5692095756530762
should be 1.0, predicted: [ 0.95092589], target predicted: [ 1.00353646]
start searching new action
get new action:  0.21263766288757324
start getting new_q
get new_q:  0.06625771522521973
train start:
critic loss [0.0018098007, 0.0017911162]
critic loss [0.0019325097, 0.0019164067]
critic loss [0.0019826817, 0.0019707996]
critic loss [0.001936

critic loss [0.0030388418, 0.0025518446]
critic loss [0.0028140547, 0.0024209989]
critic loss [0.0025580963, 0.0022745235]
critic loss [0.0023003253, 0.0021214897]
actor loss nan
train end 0.5056653022766113
should be 1.0, predicted: [ 1.06542051], target predicted: [ 1.04141688]
start searching new action
get new action:  0.21480369567871094
start getting new_q
get new_q:  0.06354212760925293
train start:
critic loss [0.00080910942, 0.00073297956]
critic loss [0.00082998746, 0.00080765295]
critic loss [0.00085122109, 0.00085016561]
critic loss [0.00085592829, 0.00084966235]
critic loss [0.00083301443, 0.00080556225]
actor loss nan
train end 0.5405135154724121
should be 1.0, predicted: [ 1.02245951], target predicted: [ 1.01784122]
should be 1.0, predicted: [ 0.99943531], target predicted: [ 1.04104805]
start searching new action
get new action:  0.2171001434326172
start getting new_q
get new_q:  0.06342363357543945
train start:
critic loss [0.0026789163, 0.0026789163]
critic loss [0.0

critic loss [0.0011110692, 0.0011110692]
critic loss [0.0010903233, 0.0010903233]
critic loss [0.0010909957, 0.0010909957]
critic loss [0.0010905159, 0.0010905159]
critic loss [0.0010702097, 0.0010702097]
actor loss nan
train end 0.7875351905822754
start searching new action
get new action:  0.2227022647857666
start getting new_q
get new_q:  0.07750153541564941
train start:
critic loss [0.0055345846, 0.0051077679]
critic loss [0.0053860229, 0.0049816277]
critic loss [0.0048861974, 0.0045460123]
critic loss [0.004184173, 0.0039323298]
critic loss [0.003442524, 0.0032831538]
actor loss nan
train end 0.6116406917572021
should be 1.0, predicted: [ 1.03138065], target predicted: [ 0.99791569]
start searching new action
get new action:  0.21454787254333496
start getting new_q
get new_q:  0.06380558013916016
train start:
critic loss [0.0022508563, 0.0021620383]
critic loss [0.0026346054, 0.0024976665]
critic loss [0.0028622581, 0.0026908659]
critic loss [0.0028676894, 0.0026872165]
critic los

critic loss [0.0029157693, 0.0029157693]
actor loss nan
train end 0.5089001655578613
start searching new action
get new action:  0.21270203590393066
start getting new_q
get new_q:  0.07062220573425293
train start:
critic loss [0.013024068, 0.013023326]
critic loss [0.011747737, 0.011747736]
critic loss [0.010149765, 0.010148618]
critic loss [0.008530993, 0.008526545]
critic loss [0.0071257474, 0.0071164113]
actor loss nan
train end 0.5220911502838135
should be 1.0, predicted: [ 0.90645635], target predicted: [ 1.02063572]
start searching new action
get new action:  0.21946048736572266
start getting new_q
get new_q:  0.06278204917907715
train start:
critic loss [0.0036045141, 0.0036002505]
critic loss [0.0046983128, 0.0046932623]
critic loss [0.0053211055, 0.0053159315]
critic loss [0.0053947642, 0.0053901067]
critic loss [0.0049795853, 0.0049758637]
actor loss nan
train end 0.542475700378418
should be 1.0, predicted: [ 0.97261959], target predicted: [ 1.018345]
Episode : 1132 Replay Bu

get new action:  0.21259498596191406
start getting new_q
get new_q:  0.0645747184753418
train start:
critic loss [0.0068726833, 0.0049203709]
critic loss [0.0075246547, 0.00559698]
critic loss [0.0071459049, 0.0055947569]
critic loss [0.0060187085, 0.0050174068]
critic loss [0.0045694271, 0.0040909657]
actor loss nan
train end 0.5306549072265625
should be 1.0, predicted: [ 0.97844416], target predicted: [ 1.00075603]
should be 1.0, predicted: [ 1.02166879], target predicted: [ 0.99952799]
should be 1.0, predicted: [ 1.0032084], target predicted: [ 0.99938089]
Episode : 1134 Replay Buffer 50000
TOTAL REWARD @ 1134-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.22110414505004883
start getting new_q
get new_q:  0.06678962707519531
train start:
critic loss [0.0013666827, 0.0013666827]
critic loss [0.00096378231, 0.00096378231]
critic loss [0.00077211385, 0.00077211385]
critic loss [0.0007674708, 0.0007674708]
critic loss [0.00088358106, 0.00088358106]


critic loss [0.0021671033, 0.0021671033]
critic loss [0.0019101675, 0.0019101675]
critic loss [0.0016082959, 0.0016082959]
critic loss [0.0013026383, 0.0013026383]
actor loss nan
train end 0.5440733432769775
Episode : 1136 Replay Buffer 50000
TOTAL REWARD @ 1136-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.22611069679260254
start getting new_q
get new_q:  0.08442974090576172
train start:
critic loss [0.0033436201, 0.0033436201]
critic loss [0.0032644495, 0.0032644495]
critic loss [0.0030904771, 0.0030904771]
critic loss [0.0028492068, 0.0028492068]
critic loss [0.0025796648, 0.0025796648]
actor loss nan
train end 0.6071348190307617
start searching new action
get new action:  0.20526480674743652
start getting new_q
get new_q:  0.06574845314025879
train start:
critic loss [0.003823176, 0.0030465734]
critic loss [0.0038053053, 0.0031540249]
critic loss [0.0034912333, 0.0030086935]
critic loss [0.0029875708, 0.0026724932]
critic loss [0.0024195137, 0

critic loss [0.00091966457, 0.00088519225]
actor loss nan
train end 0.5601992607116699
should be 1.0, predicted: [ 1.02795172], target predicted: [ 1.00318623]
start searching new action
get new action:  0.21233057975769043
start getting new_q
get new_q:  0.08170270919799805
train start:
critic loss [0.001199313, 0.00085014466]
critic loss [0.0011558753, 0.00084355276]
critic loss [0.0010056536, 0.00076369534]
critic loss [0.00079494459, 0.00063602772]
critic loss [0.0005779608, 0.00049436081]
actor loss nan
train end 0.5405302047729492
should be 1.0, predicted: [ 1.02778256], target predicted: [ 1.00352561]
should be 1.0, predicted: [ 1.01517737], target predicted: [ 0.99824661]
start searching new action
get new action:  0.2133805751800537
start getting new_q
get new_q:  0.06726455688476562
train start:
critic loss [0.0013090484, 0.001258206]
critic loss [0.001209311, 0.0011464225]
critic loss [0.0010793821, 0.0010119628]
critic loss [0.00093267782, 0.0008688377]
critic loss [0.00078

critic loss [0.0020035952, 0.0019744327]
actor loss nan
train end 0.5553195476531982
should be 1.0, predicted: [ 0.99277937], target predicted: [ 0.99398595]
start searching new action
get new action:  0.22680377960205078
start getting new_q
get new_q:  0.06406784057617188
train start:
critic loss [0.0041596629, 0.003347504]
critic loss [0.0038055133, 0.0029926978]
critic loss [0.0032641944, 0.0025697509]
critic loss [0.0026476893, 0.0021467567]
critic loss [0.0020686011, 0.0017773663]
actor loss nan
train end 0.5376431941986084
should be 1.0, predicted: [ 1.01008761], target predicted: [ 0.99887168]
should be 1.0, predicted: [ 0.97606343], target predicted: [ 0.99679184]
start searching new action
get new action:  0.23999404907226562
start getting new_q
get new_q:  0.08453869819641113
train start:
critic loss [0.0012645714, 0.0012053503]
critic loss [0.0010650895, 0.00098925713]
critic loss [0.00091219519, 0.00083083881]
critic loss [0.00081197347, 0.00073562918]
critic loss [0.000761

critic loss [0.0018792592, 0.0017137197]
actor loss nan
train end 0.5825653076171875
should be 1.0, predicted: [ 1.00342262], target predicted: [ 0.99474007]
should be 1.0, predicted: [ 1.06956887], target predicted: [ 1.01380575]
start searching new action
get new action:  0.22283649444580078
start getting new_q
get new_q:  0.06949377059936523
train start:
critic loss [0.0038753678, 0.0038152863]
critic loss [0.0037334589, 0.003695542]
critic loss [0.0034578578, 0.0034388117]
critic loss [0.0031203113, 0.0031138284]
critic loss [0.002781786, 0.0027810545]
actor loss nan
train end 0.6500964164733887
should be 1.0, predicted: [ 1.08861363], target predicted: [ 1.02596831]
start searching new action
get new action:  0.2352581024169922
start getting new_q
get new_q:  0.06442546844482422
train start:
critic loss [0.0026408611, 0.0020964495]
critic loss [0.0023780768, 0.0019271544]
critic loss [0.0020887547, 0.0017492468]
critic loss [0.0018052487, 0.001566017]
critic loss [0.001549369, 0.0

critic loss [0.0018076061, 0.0017777132]
critic loss [0.0020577738, 0.0020502424]
critic loss [0.0021783996, 0.0021781456]
critic loss [0.0021329583, 0.0021316321]
actor loss nan
train end 0.5271120071411133
should be 1.0, predicted: [ 0.91958892], target predicted: [ 0.99647689]
start searching new action
get new action:  0.22508621215820312
start getting new_q
get new_q:  0.06663775444030762
train start:
critic loss [0.0044597094, 0.0035009412]
critic loss [0.0038949137, 0.0029868705]
critic loss [0.0030437759, 0.0022719149]
critic loss [0.0021243584, 0.0015360231]
critic loss [0.0013159679, 0.00092630653]
actor loss nan
train end 0.6984801292419434
should be 1.0, predicted: [ 0.93827367], target predicted: [ 1.0113523]
start searching new action
get new action:  0.26545095443725586
start getting new_q
get new_q:  0.07404208183288574
train start:
critic loss [0.0027514654, 0.0017839789]
critic loss [0.0021391658, 0.0015453405]
critic loss [0.0017898036, 0.0015049009]
critic loss [0.0

get new action:  0.23168730735778809
start getting new_q
get new_q:  0.06958818435668945
train start:
critic loss [0.016794905, 0.016794905]
critic loss [0.015593642, 0.015593642]
critic loss [0.013827614, 0.013827614]
critic loss [0.011897899, 0.011897899]
critic loss [0.010167303, 0.010167303]
actor loss nan
train end 0.6185798645019531
start searching new action
get new action:  0.2267928123474121
start getting new_q
get new_q:  0.06677532196044922
train start:
critic loss [0.0036727171, 0.0028360037]
critic loss [0.0043169856, 0.0037320126]
critic loss [0.0048703956, 0.0045285113]
critic loss [0.0051735006, 0.0050081778]
critic loss [0.0051446818, 0.0050690277]
actor loss nan
train end 0.5818274021148682
should be 1.0, predicted: [ 0.9011113], target predicted: [ 0.99720931]
should be 1.0, predicted: [ 0.9282707], target predicted: [ 1.01818776]
should be 1.0, predicted: [ 0.88589615], target predicted: [ 1.00536716]
start searching new action
get new action:  0.22044587135314941
s

get new action:  0.23272442817687988
start getting new_q
get new_q:  0.06626772880554199
train start:
critic loss [0.0010333067, 0.0010333067]
critic loss [0.00079043047, 0.00079043047]
critic loss [0.00068028353, 0.00068028353]
critic loss [0.00067625713, 0.00067625713]
critic loss [0.00073415024, 0.00073415024]
actor loss nan
train end 0.7662055492401123
start searching new action
get new action:  0.277087926864624
start getting new_q
get new_q:  0.08665657043457031
train start:
critic loss [0.0015823632, 0.0012061573]
critic loss [0.0014067823, 0.0011028973]
critic loss [0.0012310313, 0.0010088664]
critic loss [0.0010733262, 0.00092867739]
critic loss [0.00094432273, 0.00086310139]
actor loss nan
train end 0.7403240203857422
should be 1.0, predicted: [ 0.98318362], target predicted: [ 0.99224687]
start searching new action
get new action:  0.22899556159973145
start getting new_q
get new_q:  0.06326818466186523
train start:
critic loss [0.0031758572, 0.0029581976]
critic loss [0.0031

get new action:  0.2372570037841797
start getting new_q
get new_q:  0.07460570335388184
train start:
critic loss [0.001980057, 0.001880573]
critic loss [0.0017192964, 0.0016350194]
critic loss [0.0014499534, 0.0013940877]
critic loss [0.0012231893, 0.0011963426]
critic loss [0.0010698478, 0.0010632083]
actor loss nan
train end 0.6783099174499512
should be 1.0, predicted: [ 1.06573439], target predicted: [ 1.00474465]
should be 1.0, predicted: [ 1.05905962], target predicted: [ 0.99287522]
should be 1.0, predicted: [ 1.07194722], target predicted: [ 1.01427591]
start searching new action
get new action:  0.21831297874450684
start getting new_q
get new_q:  0.06705284118652344
train start:
critic loss [0.00071968255, 0.00058499846]
critic loss [0.00061056693, 0.00053786673]
critic loss [0.00055192591, 0.00052406639]
critic loss [0.00054563815, 0.00053495442]
critic loss [0.00056938094, 0.00055378792]
actor loss nan
train end 0.5423192977905273
should be 1.0, predicted: [ 1.00828147], targ

get new action:  0.2430582046508789
start getting new_q
get new_q:  0.07110404968261719
train start:
critic loss [0.0041594803, 0.0041594803]
critic loss [0.0038511488, 0.0038511488]
critic loss [0.0033082217, 0.0033082217]
critic loss [0.0026413174, 0.0026413174]
critic loss [0.0019600387, 0.0019600387]
actor loss nan
train end 0.5909171104431152
start searching new action
get new action:  0.22449111938476562
start getting new_q
get new_q:  0.06413745880126953
train start:
critic loss [0.003034831, 0.003034831]
critic loss [0.0021557147, 0.0021557147]
critic loss [0.0014399779, 0.0014399779]
critic loss [0.00095856626, 0.00095856626]
critic loss [0.00071879744, 0.00071879744]
actor loss nan
train end 0.5427577495574951
start searching new action
get new action:  0.2163705825805664
start getting new_q
get new_q:  0.06251335144042969
train start:
critic loss [0.0067869383, 0.0065958081]
critic loss [0.0073235352, 0.0071701482]
critic loss [0.0073676836, 0.0072627338]
critic loss [0.0069

actor loss nan
train end 0.8482184410095215
should be 1.0, predicted: [ 0.98454094], target predicted: [ 1.00151861]
should be 1.0, predicted: [ 0.98454094], target predicted: [ 1.00151861]
should be 1.0, predicted: [ 0.99953264], target predicted: [ 0.99897593]
start searching new action
get new action:  0.22018218040466309
start getting new_q
get new_q:  0.07906413078308105
train start:
critic loss [0.0074809985, 0.0074241976]
critic loss [0.0063872123, 0.0063579176]
critic loss [0.0053739687, 0.0053614727]
critic loss [0.004562268, 0.0045582633]
critic loss [0.0040082643, 0.0040074987]
actor loss nan
train end 0.7986330986022949
should be 1.0, predicted: [ 0.95403093], target predicted: [ 0.99003214]
start searching new action
get new action:  0.28018951416015625
start getting new_q
get new_q:  0.09380316734313965
train start:
critic loss [0.0034946469, 0.0029655849]
critic loss [0.0041405559, 0.0036370985]
critic loss [0.0044196961, 0.0040012039]
critic loss [0.0043202145, 0.004018

critic loss [0.0027974262, 0.002797402]
actor loss nan
train end 0.530235767364502
should be 1.0, predicted: [ 1.03401661], target predicted: [ 1.00492251]
start searching new action
get new action:  0.2176971435546875
start getting new_q
get new_q:  0.06126880645751953
train start:
critic loss [0.0041358108, 0.0040864144]
critic loss [0.0036930095, 0.0036287252]
critic loss [0.0033043828, 0.0032348991]
critic loss [0.0030038119, 0.0029398324]
critic loss [0.0028022227, 0.0027523269]
actor loss nan
train end 0.7827615737915039
should be 1.0, predicted: [ 0.98546618], target predicted: [ 1.00262928]
start searching new action
get new action:  0.234083890914917
start getting new_q
get new_q:  0.06918835639953613
train start:
critic loss [0.002278687, 0.002240493]
critic loss [0.002199898, 0.002163765]
critic loss [0.0020915926, 0.002054783]
critic loss [0.0019599418, 0.0019181331]
critic loss [0.0018163528, 0.0017664644]
actor loss nan
train end 0.6435854434967041
should be 1.0, predicte

critic loss [0.0087287789, 0.00859641]
critic loss [0.0087268474, 0.0085925069]
critic loss [0.0082663633, 0.0081383511]
critic loss [0.0074875569, 0.0073728771]
actor loss nan
train end 0.6201767921447754
should be 1.0, predicted: [ 1.00641191], target predicted: [ 0.99915749]
start searching new action
get new action:  0.28121495246887207
start getting new_q
get new_q:  0.1064448356628418
train start:
critic loss [0.0035528801, 0.0034799515]
critic loss [0.0029569392, 0.0028899652]
critic loss [0.0024923058, 0.0024354779]
critic loss [0.0021922213, 0.0021483484]
critic loss [0.0020478393, 0.0020176107]
actor loss nan
train end 0.9055624008178711
should be 1.0, predicted: [ 0.99163955], target predicted: [ 0.99578416]
start searching new action
get new action:  0.23893165588378906
start getting new_q
get new_q:  0.0631256103515625
train start:
critic loss [0.005049088, 0.0049806782]
critic loss [0.0052943788, 0.0052340403]
critic loss [0.0049325018, 0.0048852125]
critic loss [0.004132

critic loss [0.0046043126, 0.0045868293]
critic loss [0.0041421135, 0.0041312575]
critic loss [0.0035889908, 0.0035842755]
critic loss [0.0030323244, 0.0030312575]
actor loss nan
train end 0.5151693820953369
should be 1.0, predicted: [ 1.06499541], target predicted: [ 0.99915791]
start searching new action
get new action:  0.2164156436920166
start getting new_q
get new_q:  0.06247711181640625
train start:
critic loss [0.0011165091, 0.0010839927]
critic loss [0.0013505013, 0.001323336]
critic loss [0.0015161752, 0.0014966374]
critic loss [0.0015816274, 0.001569914]
critic loss [0.0015472354, 0.0015418024]
actor loss nan
train end 0.5161526203155518
should be 1.0, predicted: [ 1.03933513], target predicted: [ 1.0040952]
start searching new action
get new action:  0.21141815185546875
start getting new_q
get new_q:  0.0604705810546875
train start:
critic loss [0.0043960307, 0.0043960307]
critic loss [0.0040878272, 0.0040878272]
critic loss [0.0036622658, 0.0036622658]
critic loss [0.003206

get new action:  0.2160646915435791
start getting new_q
get new_q:  0.060375213623046875
train start:
critic loss [0.002935166, 0.0028890935]
critic loss [0.0020058108, 0.0019813625]
critic loss [0.0013274006, 0.0013171462]
critic loss [0.00093769759, 0.00093369826]
critic loss [0.00081617187, 0.00081151788]
actor loss nan
train end 0.5408945083618164
should be 1.0, predicted: [ 1.05579364], target predicted: [ 1.00872171]
should be 1.0, predicted: [ 1.04656684], target predicted: [ 1.02194309]
start searching new action
get new action:  0.21080803871154785
start getting new_q
get new_q:  0.06442546844482422
train start:
critic loss [0.002190388, 0.0021117639]
critic loss [0.0025983895, 0.0025470089]
critic loss [0.0027823681, 0.0027586394]
critic loss [0.0027356632, 0.0027301498]
critic loss [0.0025014044, 0.0024996407]
actor loss nan
train end 0.49417757987976074
should be 1.0, predicted: [ 1.03654277], target predicted: [ 1.00518763]
should be 1.0, predicted: [ 1.04711843], target p

get new action:  0.226531982421875
start getting new_q
get new_q:  0.07298564910888672
train start:
critic loss [0.0026999023, 0.0026999023]
critic loss [0.0033074333, 0.0033074333]
critic loss [0.0038619726, 0.0038619726]
critic loss [0.004233079, 0.004233079]
critic loss [0.0043540895, 0.0043540895]
actor loss nan
train end 0.7519094944000244
Episode : 1165 Replay Buffer 50000
TOTAL REWARD @ 1165-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.3238341808319092
start getting new_q
get new_q:  0.09209132194519043
train start:
critic loss [0.001604868, 0.001550391]
critic loss [0.0015463656, 0.0014914876]
critic loss [0.001426087, 0.0013777388]
critic loss [0.0012699591, 0.0012327686]
critic loss [0.0011057341, 0.0010811533]
actor loss nan
train end 0.98232102394104
should be 1.0, predicted: [ 1.06304264], target predicted: [ 0.99365258]
start searching new action
get new action:  0.23805546760559082
start getting new_q
get new_q:  0.0623595714569091

critic loss [0.0051366379, 0.0051366379]
critic loss [0.005209812, 0.005209812]
critic loss [0.0048982915, 0.0048982915]
critic loss [0.0043097902, 0.0043097902]
actor loss nan
train end 0.6031808853149414
start searching new action
get new action:  0.21198296546936035
start getting new_q
get new_q:  0.06571364402770996
train start:
critic loss [0.0044132052, 0.0042532566]
critic loss [0.0044869324, 0.0043513598]
critic loss [0.0044150027, 0.0043244888]
critic loss [0.0042073634, 0.0041642478]
critic loss [0.0039065746, 0.0038964285]
actor loss nan
train end 0.7193059921264648
should be 1.0, predicted: [ 0.95217419], target predicted: [ 0.99721491]
should be 1.0, predicted: [ 0.95953631], target predicted: [ 0.99554151]
start searching new action
get new action:  0.2141432762145996
start getting new_q
get new_q:  0.06509065628051758
train start:
critic loss [0.0042291768, 0.00406841]
critic loss [0.0038224985, 0.0037232111]
critic loss [0.0033285404, 0.0032831905]
critic loss [0.002849

critic loss [0.0013727201, 0.0010135162]
actor loss nan
train end 1.1948354244232178
should be 1.0, predicted: [ 1.01017594], target predicted: [ 1.03461015]
should be 1.0, predicted: [ 0.96904767], target predicted: [ 0.99783307]
should be 1.0, predicted: [ 1.02596581], target predicted: [ 1.01518631]
should be 1.0, predicted: [ 0.97500777], target predicted: [ 0.99384713]
start searching new action
get new action:  0.24121594429016113
start getting new_q
get new_q:  0.07845520973205566
train start:
critic loss [0.0050790543, 0.0048358971]
critic loss [0.0044077588, 0.0043105767]
critic loss [0.0038532815, 0.0038343999]
critic loss [0.0034684166, 0.0034679845]
critic loss [0.0032531845, 0.003229612]
actor loss nan
train end 0.637019157409668
should be 1.0, predicted: [ 0.84735096], target predicted: [ 0.99642825]
start searching new action
get new action:  0.21852993965148926
start getting new_q
get new_q:  0.06864166259765625
train start:
critic loss [0.016075611, 0.011310451]
critic

critic loss [0.0030967989, 0.0030967989]
actor loss nan
train end 0.5177803039550781
Episode : 1171 Replay Buffer 50000
TOTAL REWARD @ 1171-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2852442264556885
start getting new_q
get new_q:  0.0823659896850586
train start:
critic loss [0.0015530161, 0.001301685]
critic loss [0.0015657754, 0.0013574373]
critic loss [0.0014016806, 0.0012643703]
critic loss [0.0011342017, 0.0010632765]
critic loss [0.0008496549, 0.00081527146]
actor loss nan
train end 0.937028169631958
should be 1.0, predicted: [ 0.98692107], target predicted: [ 0.99263138]
should be 1.0, predicted: [ 1.03743505], target predicted: [ 1.03580415]
should be 1.0, predicted: [ 1.04366624], target predicted: [ 0.99670535]
should be 1.0, predicted: [ 1.03743505], target predicted: [ 1.03580415]
start searching new action
get new action:  0.22929787635803223
start getting new_q
get new_q:  0.07416915893554688
train start:
critic loss [0.0013097704

Episode : 1173 Replay Buffer 50000
TOTAL REWARD @ 1173-th Episode  : Reward 1
Total Step: 186
start searching new action
get new action:  0.2290782928466797
start getting new_q
get new_q:  0.06722044944763184
train start:
critic loss [0.003638471, 0.0035439108]
critic loss [0.0033967651, 0.0033316256]
critic loss [0.0031960073, 0.0031593582]
critic loss [0.0030557083, 0.0030410066]
critic loss [0.0029744697, 0.0029720208]
actor loss nan
train end 0.5304536819458008
should be 1.0, predicted: [ 1.11002636], target predicted: [ 0.99855512]
start searching new action
get new action:  0.20797228813171387
start getting new_q
get new_q:  0.06840658187866211
train start:
critic loss [0.0013790054, 0.0011261504]
critic loss [0.0014857557, 0.0011922002]
critic loss [0.0014283108, 0.0011647976]
critic loss [0.0012396385, 0.0010568141]
critic loss [0.00099892181, 0.00090645393]
actor loss nan
train end 0.5095870494842529
should be 1.0, predicted: [ 1.05488074], target predicted: [ 1.01455271]
shou

critic loss [0.00042682665, 0.00042682665]
critic loss [0.00034276705, 0.00034276705]
actor loss nan
train end 0.6239204406738281
start searching new action
get new action:  0.22012996673583984
start getting new_q
get new_q:  0.07381343841552734
train start:
critic loss [0.0043586064, 0.0043586064]
critic loss [0.0043206038, 0.0043206038]
critic loss [0.003981675, 0.003981675]
critic loss [0.0034548901, 0.0034548901]
critic loss [0.0028752144, 0.0028752144]
actor loss nan
train end 0.68678879737854
start searching new action
get new action:  0.22983145713806152
start getting new_q
get new_q:  0.07631993293762207
train start:
critic loss [0.0067217182, 0.0064895563]
critic loss [0.0061818222, 0.0059623243]
critic loss [0.0057306755, 0.0055439118]
critic loss [0.0054081245, 0.0052662971]
critic loss [0.0052206642, 0.0051226243]
actor loss nan
train end 0.6114988327026367
should be 1.0, predicted: [ 1.02852452], target predicted: [ 1.01745546]
start searching new action
get new action:  0

get new action:  0.23708415031433105
start getting new_q
get new_q:  0.07388734817504883
train start:
critic loss [0.0012716691, 0.0012657929]
critic loss [0.0010113145, 0.0010064754]
critic loss [0.00077108154, 0.00076769013]
critic loss [0.00058607379, 0.00058411789]
critic loss [0.00047619478, 0.00047536567]
actor loss nan
train end 0.6563844680786133
should be 1.0, predicted: [ 0.95471066], target predicted: [ 0.99234682]
start searching new action
get new action:  0.2159252166748047
start getting new_q
get new_q:  0.07383418083190918
train start:
critic loss [0.0024801183, 0.0024397504]
critic loss [0.0026711235, 0.0026336282]
critic loss [0.0026278924, 0.0025950987]
critic loss [0.0023769555, 0.0023499047]
critic loss [0.0019893537, 0.0019682338]
actor loss nan
train end 0.5663590431213379
should be 1.0, predicted: [ 0.96146441], target predicted: [ 0.99573278]
start searching new action
get new action:  0.21525168418884277
start getting new_q
get new_q:  0.06676506996154785
trai

get new action:  0.21790242195129395
start getting new_q
get new_q:  0.06296038627624512
train start:
critic loss [0.0028454801, 0.0028454801]
critic loss [0.0021993639, 0.0021993639]
critic loss [0.00157927, 0.00157927]
critic loss [0.0010946628, 0.0010946628]
critic loss [0.000792932, 0.000792932]
actor loss nan
train end 0.5931017398834229
start searching new action
get new action:  0.22494292259216309
start getting new_q
get new_q:  0.06324052810668945
train start:
critic loss [0.0041898382, 0.0040059602]
critic loss [0.0043581827, 0.004173758]
critic loss [0.0044557145, 0.0042850985]
critic loss [0.0044451384, 0.0042995531]
critic loss [0.0043208115, 0.004206812]
actor loss nan
train end 0.5944252014160156
should be 1.0, predicted: [ 0.98867297], target predicted: [ 0.99031293]
start searching new action
get new action:  0.22285795211791992
start getting new_q
get new_q:  0.06232118606567383
train start:
critic loss [0.0018292367, 0.0015884687]
critic loss [0.0016891879, 0.0015016

KeyboardInterrupt: 

In [50]:
graph = 0

def reinforcement_data_generator(actor_model, actor_target, critic_model, critic_target, relay, \
                                 train_critic=True, step_per_episode=50, \
                                 BATCH_SIZE=64, GAMMA=0.99, RELAY_SIZE=10000): 
    global graph
    with graph.as_default():
        max_score = 1
        min_score = 0.02
        policy_mask_init = 0.01

        actor = ActorNetwork(target_model=actor_target, model=actor_model)
        critic = CriticNetwork(target_model=critic_target, model=actor_model)
        print("actor critic prepared")

        while True:
            reward = 0
            step = 0

            game = Ataxx()
            turn = -1 # start turn is -1

            while abs(game.evaluate(turn, turn, max_score, min_score)) != max_score:
                new_sample = [] # prepare to collect new sample

                pos0, pos1, cur_action_mask = actor.get_noised_action(game, turn, epsilon=0.1, mask_init=policy_mask_init, verbose=0) # get proposed action

                new_state = game.simu_move_to(turn, pos0, pos1)

                # create new_sample and add that to relay
                new_sample.append(game.get_feature_map(turn)) # cur_state
                new_sample.append(game.get_mask(turn)) # cur_mask
                new_sample.append(actor.get_action_prob(game, turn)) # cur_action
                new_sample.append(cur_action_mask) # cur_action_mask

                game.move_to(turn, pos0, pos1) # do the move here if the move is valid
                turn = -turn # the turn now is the next turn, what we are learning for is -turn

                new_sample.append(game.evaluate(-turn, turn, max_score, min_score)) # cur_reward
                if abs(new_sample[-1]) == max_score:
                    new_sample.append(None) 
                else:
                    new_sample.append(game.get_feature_map(turn)) # new_state

                relay.add_sample(new_sample)

            if relay.size() > BATCH_SIZE:
                for t in range(step_per_episode):
                    cur_state, cur_mask, cur_action, cur_action_mask, \
                        cur_reward, new_state = relay.get_batch(BATCH_SIZE)
                    q_target = np.zeros(BATCH_SIZE)

                    for k in range(BATCH_SIZE):
                        if new_state[k] is None:
                            q_target[k] = cur_reward[k]
                        else:
                            # generate new action
                            board = new_state[k][0] - new_state[k][1]
                            tmp_game = Ataxx(board) # recover the game board

                            new_action, new_action_mask = actor.get_noised_action(tmp_game, 1, epsilon=0, return_position=False, 
                                                                        mask_init=policy_mask_init, is_target=True, verbose=False)
                            q_new = critic.target_model.predict([new_state[k].reshape(1, 2, 7, 7), 
                                                                new_action.reshape(1, 2, 7, 7),
                                                                new_action_mask.reshape(1, 2, 7, 7)])[0]
                            q_target[k] = cur_reward[k] - GAMMA*q_new # cuz this q_new is for the opponent
                    if train_critic:
                        yield [cur_state, cur_action, cur_action_mask], q_target
                    else:
                        yield [cur_state, cur_mask, cur_action_mask], np.zeros(BATCH_SIZE)

def actor_critic_trainer(epochs=1000, check_point=True, steps_per_epoch=50, \
                           BATCH_SIZE=64, TRAIN_BATCH_TIME=1, GAMMA=0.99, RELAY_SIZE=10000, TAU=0.001, LRA=0.0001, LRC=0.0001): 
    actor = ActorNetwork(TAU, LRA, 0)
    critic = CriticNetwork(TAU, LRC)
    relay = Relay(RELAY_SIZE)
    global graph
    graph = tf.get_default_graph()

    with tf.device('/cpu'):
        actor_critic_model = Model(inputs=[actor.model.inputs[0], actor.model.inputs[1], critic.frozen_model.inputs[2]],
                                   outputs=[critic.frozen_model([actor.model.inputs[0], actor.model.outputs[0], critic.frozen_model.inputs[2]])])
        sgd = SGD(lr=LRA, decay=5e-7, momentum=0.9, nesterov=True)
        actor_critic_model.compile(loss=neg_linear_loss, optimizer=sgd)
    gpu_ac_model = multi_gpu_model(actor_critic_model, gpus=4)
    gpu_ac_model.compile(loss=neg_linear_loss, optimizer=Adam(lr=LRA))
    print(gpu_ac_model.summary())
    
    for ep in range(epochs):
        critic.gpu_model.fit_generator(reinforcement_data_generator(actor.model, actor.target_model, \
                                                                    critic.model, critic.target_model, \
                                                                    relay, train_critic=True, \
                                                                    step_per_episode=50, \
                                                                    BATCH_SIZE=256, GAMMA=0.99, RELAY_SIZE=RELAY_SIZE),\
                                       verbose=1, steps_per_epoch=steps_per_epoch, \
                                       #workers=8, use_multiprocessing=True
                                      )
        gpu_ac_model.fit_generator(reinforcement_data_generator(actor.model, actor.target_model, \
                                                                critic.model, critic.target_model, \
                                                                relay, train_critic=False, \
                                                                step_per_episode=50, \
                                                                BATCH_SIZE=256, GAMMA=0.99, RELAY_SIZE=RELAY_SIZE),\
                                   verbose=1, steps_per_epoch=steps_per_epoch, \
                                   #workers=8, use_multiprocessing=True
                                  )
        actor.target_train()
        critic.target_train()
        
        if check_point:
            actor.save()
        tmp_game = Ataxx()
        print("trained action prob map predicted by initial model for a starting game")
        print(actor.get_action_prob(tmp_game, -1, is_target=True))

In [51]:
actor_critic_trainer(epochs=1000, check_point=True, steps_per_epoch=50, \
                     BATCH_SIZE=64, TRAIN_BATCH_TIME=1, GAMMA=0.99, RELAY_SIZE=10000, \
                     TAU=0.001, LRA=0.0001, LRC=0.0001) 
    



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_153 (InputLayer)          (None, 2, 7, 7)      0                                            
__________________________________________________________________________________________________
conv2d_242 (Conv2D)             (None, 64, 5, 5)     1216        input_153[0][0]                  
__________________________________________________________________________________________________
batch_normalization_271 (BatchN (None, 64, 5, 5)     256         conv2d_242[0][0]                 
__________________________________________________________________________________________________
conv2d_243 (Conv2D)             (None, 64, 5, 5)     36928       batch_normalization_271[0][0]    
__________________________________________________________________________________________________
batch_norm

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_161 (InputLayer)          (None, 2, 7, 7)      0                                            
__________________________________________________________________________________________________
input_162 (InputLayer)          (None, 2, 7, 7)      0                                            
__________________________________________________________________________________________________
multiply_64 (Multiply)          (None, 2, 7, 7)      0           input_161[0][0]                  
                                                                 input_162[0][0]                  
__________________________________________________________________________________________________
input_160 (InputLayer)          (None, 2, 7, 7)      0                                            
__________



Exception ignored in: <generator object reinforcement_data_generator at 0x7f765f58e468>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic preparedEpoch 1/1



Exception ignored in: <generator object reinforcement_data_generator at 0x7f765f58e468>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.020408    0.020408    0.020408    0.020408    0.020408    0.020408
    0.020408  ]
  [ 0.020408    0.020408    0.020408    0.020408    0.020408    0.020408
    0.020408  ]
  [ 0.020408    0.020408    0.020408    0.020408    0.020408    0.020408
    0.020408  ]
  [ 0.020408    0.020408    0.020408    0.020408    0.020408    0.020408
    0.020408  ]
  [ 0.020408    0.020408    0.020408    0.020408    0.020408    0.020408
    0.020408  ]
  [ 0.020408    0.020408    0.020408    0.020408    0.020408    0.020408
    0.020408  ]
  [ 0.020408    0.020408    0.020408    0.020408    0.020408    0.020408
    0.02041597]]

 [[ 0.02040417  0.02040417  0.02040417  0.02040417  0.02040417  0.02040417
    0.02040417]
  [ 0.02042603  0.02042381  0.02041361  0.02040417  0.02040417  0.02040417
    0.02040417]
  [ 0.02040417  0.02042197  0.02040417  0.02040417  0.02040417  0.02040417
    0.02040417]
  [ 0.02040417  0.02040417  0.0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02040799  0.02040799  0.02040799  0.02040799  0.02040799  0.02040799
    0.02040799]
  [ 0.02040799  0.02040799  0.02040799  0.02040799  0.02040799  0.02040799
    0.02040799]
  [ 0.02040799  0.02040799  0.02040799  0.02040799  0.02040799  0.02040799
    0.02040799]
  [ 0.02040799  0.02040799  0.02040799  0.02040799  0.02040799  0.02040799
    0.02040799]
  [ 0.02040799  0.02040799  0.02040799  0.02040799  0.02040799  0.02040799
    0.02040799]
  [ 0.02040799  0.02040799  0.02040799  0.02040799  0.02040799  0.02040799
    0.02040799]
  [ 0.02040799  0.02040799  0.02040799  0.02040799  0.02040799  0.02040799
    0.0204167 ]]

 [[ 0.02040579  0.02040579  0.0204107   0.02040579  0.02040579  0.02040579
    0.02040579]
  [ 0.020414    0.02040579  0.02042309  0.02040579  0.02040579  0.02040579
    0.02040579]
  [ 0.02040579  0.02040579  0.02040579  0.02040579  0.02040579  0.02040579
    0.02040579]
  [ 0.02040579  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic preparedEpoch 1/1



Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02040795  0.02040795  0.02040795  0.02040795  0.02040795  0.02040795
    0.02040795]
  [ 0.02040795  0.02040795  0.02040795  0.02040795  0.02040795  0.02040795
    0.02040795]
  [ 0.02040795  0.02040795  0.02040795  0.02040795  0.02040795  0.02040795
    0.02040795]
  [ 0.02040795  0.02040795  0.02040795  0.02040795  0.02040795  0.02040795
    0.02040795]
  [ 0.02040795  0.02040795  0.02040795  0.02040795  0.02040795  0.02040795
    0.02040795]
  [ 0.02040795  0.02040795  0.02040795  0.02040795  0.02040795  0.02040795
    0.02040795]
  [ 0.02040795  0.02040795  0.02040795  0.02040795  0.02040795  0.02040795
    0.02041829]]

 [[ 0.02040545  0.02040545  0.02043251  0.02040545  0.02040545  0.02040545
    0.02040545]
  [ 0.02040545  0.02040545  0.02042651  0.02040545  0.02040545  0.02040545
    0.02040545]
  [ 0.02040545  0.02040545  0.02040961  0.02040545  0.02040545  0.02040545
    0.02040545]
  [ 0.02040545  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02040797  0.02040797  0.02040797  0.02040797  0.02040797  0.02040797
    0.02040797]
  [ 0.02040797  0.02040797  0.02040797  0.02040797  0.02040797  0.02040797
    0.02040797]
  [ 0.02040797  0.02040797  0.02040797  0.02040797  0.02040797  0.02040797
    0.02040797]
  [ 0.02040797  0.02040797  0.02040797  0.02040797  0.02040797  0.02040797
    0.02040797]
  [ 0.02040797  0.02040797  0.02040797  0.02040797  0.02040797  0.02040797
    0.02040797]
  [ 0.02040797  0.02040797  0.02040797  0.02040797  0.02040797  0.02040797
    0.02040797]
  [ 0.02040797  0.02040797  0.02040797  0.02040797  0.02040797  0.02040797
    0.02041747]]

 [[ 0.02040438  0.02040438  0.02045121  0.02040438  0.02040438  0.02040438
    0.02040438]
  [ 0.02040438  0.02040438  0.02042737  0.02040438  0.02040438  0.02040438
    0.02040438]
  [ 0.02040438  0.02040438  0.02041601  0.02040438  0.02040438  0.02040438
    0.02040438]
  [ 0.02040438  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic preparedEpoch 1/1



Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02040808  0.02040808  0.02040808  0.02040808  0.02040808  0.02040808
    0.02040808]
  [ 0.02040808  0.02040808  0.02040808  0.02040808  0.02040808  0.02040808
    0.02040808]
  [ 0.02040808  0.02040808  0.02040808  0.02040808  0.02040808  0.02040808
    0.02040808]
  [ 0.02040808  0.02040808  0.02040808  0.02040808  0.02040808  0.02040808
    0.02040808]
  [ 0.02040808  0.02040808  0.02040808  0.02040808  0.02040808  0.02040808
    0.02040808]
  [ 0.02040808  0.02040808  0.02040808  0.02040808  0.02040808  0.02040808
    0.02040808]
  [ 0.02040808  0.02040808  0.02040808  0.02040808  0.02040808  0.02040808
    0.02041208]]

 [[ 0.02040343  0.02040343  0.0204699   0.02040343  0.02040343  0.02040343
    0.02040343]
  [ 0.02040343  0.02040343  0.02042824  0.02040343  0.02040343  0.02040343
    0.02040343]
  [ 0.02040343  0.02040343  0.02042096  0.02040343  0.02040343  0.02040343
    0.02040343]
  [ 0.02040343  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02041005  0.02040812  0.02040812  0.02040812  0.02040812  0.02040812
    0.02040812]
  [ 0.02040812  0.02040812  0.02040812  0.02040812  0.02040812  0.02040812
    0.02040812]
  [ 0.02040812  0.02040812  0.02040812  0.02040812  0.02040812  0.02040812
    0.02040812]
  [ 0.02040812  0.02040812  0.02040812  0.02040812  0.02040812  0.02040812
    0.02040812]
  [ 0.02040812  0.02040812  0.02040812  0.02040812  0.02040812  0.02040812
    0.02040812]
  [ 0.02040812  0.02040812  0.02040812  0.02040812  0.02040812  0.02040812
    0.02040812]
  [ 0.02040812  0.02040812  0.02040812  0.02040812  0.02040812  0.02040812
    0.02040812]]

 [[ 0.02040233  0.02040233  0.02048798  0.02040233  0.02040233  0.02040233
    0.02040233]
  [ 0.02040233  0.02040233  0.02042876  0.02040233  0.02040233  0.02040233
    0.02040233]
  [ 0.02040233  0.02040233  0.02042454  0.02040233  0.02040233  0.02040233
    0.02040233]
  [ 0.02040233  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02041354  0.02040805  0.02040805  0.02040805  0.02040805  0.02040805
    0.02040805]
  [ 0.02040805  0.02040805  0.02040805  0.02040805  0.02040805  0.02040805
    0.02040805]
  [ 0.02040805  0.02040805  0.02040805  0.02040805  0.02040805  0.02040805
    0.02040805]
  [ 0.02040805  0.02040805  0.02040805  0.02040805  0.02040805  0.02040805
    0.02040805]
  [ 0.02040805  0.02040805  0.02040805  0.02040805  0.02040805  0.02040805
    0.02040805]
  [ 0.02040805  0.02040805  0.02040805  0.02040805  0.02040805  0.02040805
    0.02040805]
  [ 0.02040805  0.02040805  0.02040805  0.02040805  0.02040805  0.02040805
    0.02040805]]

 [[ 0.02040141  0.02040141  0.02050594  0.02040141  0.02040141  0.02040141
    0.02040141]
  [ 0.02040141  0.02040141  0.02043002  0.02040141  0.02040141  0.02040141
    0.02040141]
  [ 0.02040141  0.02040141  0.02042576  0.02040141  0.02040141  0.02040141
    0.02040141]
  [ 0.02040141  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02041675  0.02040798  0.02040798  0.02040798  0.02040798  0.02040798
    0.02040798]
  [ 0.02040798  0.02040798  0.02040798  0.02040798  0.02040798  0.02040798
    0.02040798]
  [ 0.02040798  0.02040798  0.02040798  0.02040798  0.02040798  0.02040798
    0.02040798]
  [ 0.02040798  0.02040798  0.02040798  0.02040798  0.02040798  0.02040798
    0.02040798]
  [ 0.02040798  0.02040798  0.02040798  0.02040798  0.02040798  0.02040798
    0.02040798]
  [ 0.02040798  0.02040798  0.02040798  0.02040798  0.02040798  0.02040798
    0.02040798]
  [ 0.02040798  0.02040798  0.02040798  0.02040798  0.02040798  0.02040798
    0.02040798]]

 [[ 0.02040056  0.02040056  0.02052318  0.02040056  0.02040056  0.02040056
    0.02040056]
  [ 0.02040056  0.02040056  0.02043022  0.02040056  0.02040056  0.02040056
    0.02040056]
  [ 0.02040056  0.02040056  0.02042694  0.02040056  0.02040056  0.02040056
    0.02040056]
  [ 0.02040056  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02041998  0.02040792  0.02040792  0.02040792  0.02040792  0.02040792
    0.02040792]
  [ 0.02040792  0.02040792  0.02040792  0.02040792  0.02040792  0.02040792
    0.02040792]
  [ 0.02040792  0.02040792  0.02040792  0.02040792  0.02040792  0.02040792
    0.02040792]
  [ 0.02040792  0.02040792  0.02040792  0.02040792  0.02040792  0.02040792
    0.02040792]
  [ 0.02040792  0.02040792  0.02040792  0.02040792  0.02040792  0.02040792
    0.02040792]
  [ 0.02040792  0.02040792  0.02040792  0.02040792  0.02040792  0.02040792
    0.02040792]
  [ 0.02040792  0.02040792  0.02040792  0.02040792  0.02040792  0.02040792
    0.02040792]]

 [[ 0.02039975  0.02039975  0.02053815  0.02039975  0.02039975  0.02039975
    0.02039975]
  [ 0.02039975  0.02039975  0.02043092  0.02039975  0.02039975  0.02039975
    0.02039975]
  [ 0.02039975  0.02039975  0.02042914  0.02039975  0.02039975  0.02039975
    0.02039975]
  [ 0.02039975  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic preparedEpoch 1/1



Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.0204231   0.02040785  0.02040785  0.02040785  0.02040785  0.02040785
    0.02040785]
  [ 0.02040785  0.02040785  0.02040785  0.02040785  0.02040785  0.02040785
    0.02040785]
  [ 0.02040785  0.02040785  0.02040785  0.02040785  0.02040785  0.02040785
    0.02040785]
  [ 0.02040785  0.02040785  0.02040785  0.02040785  0.02040785  0.02040785
    0.02040785]
  [ 0.02040785  0.02040785  0.02040785  0.02040785  0.02040785  0.02040785
    0.02040785]
  [ 0.02040785  0.02040785  0.02040785  0.02040785  0.02040785  0.02040785
    0.02040785]
  [ 0.02040785  0.02040785  0.02040785  0.02040785  0.02040785  0.02040785
    0.02040785]]

 [[ 0.02039894  0.02040579  0.0205518   0.02039894  0.02039894  0.02039894
    0.02039894]
  [ 0.02039894  0.02039894  0.02043077  0.02039894  0.02039894  0.02039894
    0.02039894]
  [ 0.02039894  0.02039894  0.02042754  0.02039894  0.02039894  0.02039894
    0.02039894]
  [ 0.02039894  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic preparedEpoch 1/1



Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02042691  0.02040777  0.02040777  0.02040777  0.02040777  0.02040777
    0.02040777]
  [ 0.02040777  0.02040777  0.02040777  0.02040777  0.02040777  0.02040777
    0.02040777]
  [ 0.02040777  0.02040777  0.02040777  0.02040777  0.02040777  0.02040777
    0.02040777]
  [ 0.02040777  0.02040777  0.02040777  0.02040777  0.02040777  0.02040777
    0.02040777]
  [ 0.02040777  0.02040777  0.02040777  0.02040777  0.02040777  0.02040777
    0.02040777]
  [ 0.02040777  0.02040777  0.02040777  0.02040777  0.02040777  0.02040777
    0.02040777]
  [ 0.02040777  0.02040777  0.02040777  0.02040777  0.02040777  0.02040777
    0.02040777]]

 [[ 0.02039819  0.02041248  0.02056349  0.02039819  0.02039819  0.02039819
    0.02039819]
  [ 0.02039819  0.02039819  0.0204301   0.02039819  0.02039819  0.02039819
    0.02039819]
  [ 0.02039819  0.02039819  0.02042566  0.02039819  0.02039819  0.02039819
    0.02039819]
  [ 0.02039819  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.0204308   0.02040769  0.02040769  0.02040769  0.02040769  0.02040769
    0.02040769]
  [ 0.02040769  0.02040769  0.02040769  0.02040769  0.02040769  0.02040769
    0.02040769]
  [ 0.02040769  0.02040769  0.02040769  0.02040769  0.02040769  0.02040769
    0.02040769]
  [ 0.02040769  0.02040769  0.02040769  0.02040769  0.02040769  0.02040769
    0.02040769]
  [ 0.02040769  0.02040769  0.02040769  0.02040769  0.02040769  0.02040769
    0.02040769]
  [ 0.02040769  0.02040769  0.02040769  0.02040769  0.02040769  0.02040769
    0.02040769]
  [ 0.02040769  0.02040769  0.02040769  0.02040769  0.02040769  0.02040769
    0.02040769]]

 [[ 0.02039758  0.02041801  0.02057318  0.02039758  0.02039758  0.02039758
    0.02039758]
  [ 0.02039758  0.02039758  0.02042926  0.02039758  0.02039758  0.02039758
    0.02039758]
  [ 0.02039758  0.02039758  0.02042391  0.02039758  0.02039758  0.02039758
    0.02039758]
  [ 0.02039758  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02043401  0.02040763  0.02040763  0.02040763  0.02040763  0.02040763
    0.02040763]
  [ 0.02040763  0.02040763  0.02040763  0.02040763  0.02040763  0.02040763
    0.02040763]
  [ 0.02040763  0.02040763  0.02040763  0.02040763  0.02040763  0.02040763
    0.02040763]
  [ 0.02040763  0.02040763  0.02040763  0.02040763  0.02040763  0.02040763
    0.02040763]
  [ 0.02040763  0.02040763  0.02040763  0.02040763  0.02040763  0.02040763
    0.02040763]
  [ 0.02040763  0.02040763  0.02040763  0.02040763  0.02040763  0.02040763
    0.02040763]
  [ 0.02040763  0.02040763  0.02040763  0.02040763  0.02040763  0.02040763
    0.02040763]]

 [[ 0.02039705  0.02042411  0.02058309  0.02039705  0.02039705  0.02039705
    0.02039705]
  [ 0.02039705  0.02039705  0.02042831  0.02039705  0.02039705  0.02039705
    0.02039705]
  [ 0.02039705  0.02039705  0.02042199  0.02039705  0.02039705  0.02039705
    0.02039705]
  [ 0.02039705  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02043587  0.02040759  0.02040759  0.02040759  0.02040759  0.02040759
    0.02040759]
  [ 0.02040759  0.02040759  0.02040759  0.02040759  0.02040759  0.02040759
    0.02040759]
  [ 0.02040759  0.02040759  0.02040759  0.02040759  0.02040759  0.02040759
    0.02040759]
  [ 0.02040759  0.02040759  0.02040759  0.02040759  0.02040759  0.02040759
    0.02040759]
  [ 0.02040759  0.02040759  0.02040759  0.02040759  0.02040759  0.02040759
    0.02040759]
  [ 0.02040759  0.02040759  0.02040759  0.02040759  0.02040759  0.02040759
    0.02040759]
  [ 0.02040759  0.02040759  0.02040759  0.02040759  0.02040759  0.02040759
    0.02040759]]

 [[ 0.02039659  0.02043026  0.020593    0.02039659  0.02039659  0.02039659
    0.02039659]
  [ 0.02039659  0.02039659  0.02042667  0.02039659  0.02039659  0.02039659
    0.02039659]
  [ 0.02039659  0.02039659  0.02042037  0.02039659  0.02039659  0.02039659
    0.02039659]
  [ 0.02039659  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.0204375   0.02040755  0.02040755  0.02040755  0.02040755  0.02040755
    0.02040755]
  [ 0.02040755  0.02040755  0.02040755  0.02040755  0.02040755  0.02040755
    0.02040755]
  [ 0.02040755  0.02040755  0.02040755  0.02040755  0.02040755  0.02040755
    0.02040755]
  [ 0.02040755  0.02040755  0.02040755  0.02040755  0.02040755  0.02040755
    0.02040755]
  [ 0.02040755  0.02040755  0.02040755  0.02040755  0.02040755  0.02040755
    0.02040755]
  [ 0.02040755  0.02040755  0.02040755  0.02040755  0.02040755  0.02040755
    0.02040755]
  [ 0.02040755  0.02040755  0.02040755  0.02040755  0.02040755  0.02040755
    0.02040755]]

 [[ 0.02039614  0.02043639  0.02060285  0.02039614  0.02039614  0.02039614
    0.02039614]
  [ 0.02039614  0.02039614  0.02042501  0.02039614  0.02039614  0.02039614
    0.02039614]
  [ 0.02039614  0.02039614  0.02041861  0.02039614  0.02039614  0.02039614
    0.02039614]
  [ 0.02039614  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic preparedEpoch 1/1



Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02043931  0.02040751  0.02040751  0.02040751  0.02040751  0.02040751
    0.02040751]
  [ 0.02040751  0.02040751  0.02040751  0.02040751  0.02040751  0.02040751
    0.02040751]
  [ 0.02040751  0.02040751  0.02040751  0.02040751  0.02040751  0.02040751
    0.02040751]
  [ 0.02040751  0.02040751  0.02040751  0.02040751  0.02040751  0.02040751
    0.02040751]
  [ 0.02040751  0.02040751  0.02040751  0.02040751  0.02040751  0.02040751
    0.02040751]
  [ 0.02040751  0.02040751  0.02040751  0.02040751  0.02040751  0.02040751
    0.02040751]
  [ 0.02040751  0.02040751  0.02040751  0.02040751  0.02040751  0.02040751
    0.02040751]]

 [[ 0.02039566  0.02044197  0.02061351  0.02039566  0.02039566  0.02039566
    0.02039566]
  [ 0.02039566  0.02039566  0.02042458  0.02039566  0.02039566  0.02039566
    0.02039566]
  [ 0.02039566  0.02039566  0.02041656  0.02039566  0.02039566  0.02039566
    0.02039566]
  [ 0.02039566  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02044123  0.02040748  0.02040748  0.02040748  0.02040748  0.02040748
    0.02040748]
  [ 0.02040748  0.02040748  0.02040748  0.02040748  0.02040748  0.02040748
    0.02040748]
  [ 0.02040748  0.02040748  0.02040748  0.02040748  0.02040748  0.02040748
    0.02040748]
  [ 0.02040748  0.02040748  0.02040748  0.02040748  0.02040748  0.02040748
    0.02040748]
  [ 0.02040748  0.02040748  0.02040748  0.02040748  0.02040748  0.02040748
    0.02040748]
  [ 0.02040748  0.02040748  0.02040748  0.02040748  0.02040748  0.02040748
    0.02040748]
  [ 0.02040748  0.02040748  0.02040748  0.02040748  0.02040748  0.02040748
    0.02040748]]

 [[ 0.02039513  0.0204482   0.0206238   0.02039513  0.02039513  0.02039513
    0.02039513]
  [ 0.02039513  0.02039513  0.02042557  0.02039513  0.02039513  0.02039513
    0.02039513]
  [ 0.02039513  0.02039513  0.0204139   0.02039513  0.02039513  0.02039513
    0.02039513]
  [ 0.02039513  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b048>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.0204425   0.02040745  0.02040745  0.02040745  0.02040745  0.02040745
    0.02040745]
  [ 0.02040745  0.02040745  0.02040745  0.02040745  0.02040745  0.02040745
    0.02040745]
  [ 0.02040745  0.02040745  0.02040745  0.02040745  0.02040745  0.02040745
    0.02040745]
  [ 0.02040745  0.02040745  0.02040745  0.02040745  0.02040745  0.02040745
    0.02040745]
  [ 0.02040745  0.02040745  0.02040745  0.02040745  0.02040745  0.02040745
    0.02040745]
  [ 0.02040745  0.02040745  0.02040745  0.02040745  0.02040745  0.02040745
    0.02040745]
  [ 0.02040745  0.02040745  0.02040745  0.02040745  0.02040745  0.02040745
    0.02040745]]

 [[ 0.02039455  0.02045457  0.02063477  0.02039455  0.02039455  0.02039455
    0.02039455]
  [ 0.02039455  0.02039455  0.02042698  0.02039455  0.02039455  0.02039455
    0.02039455]
  [ 0.02039455  0.02039455  0.02041113  0.02039455  0.02039455  0.02039455
    0.02039455]
  [ 0.02039455  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02044372  0.02040742  0.02040742  0.02040742  0.02040742  0.02040742
    0.02040742]
  [ 0.02040742  0.02040742  0.02040742  0.02040742  0.02040742  0.02040742
    0.02040742]
  [ 0.02040742  0.02040742  0.02040742  0.02040742  0.02040742  0.02040742
    0.02040742]
  [ 0.02040742  0.02040742  0.02040742  0.02040742  0.02040742  0.02040742
    0.02040742]
  [ 0.02040742  0.02040742  0.02040742  0.02040742  0.02040742  0.02040742
    0.02040742]
  [ 0.02040742  0.02040742  0.02040742  0.02040742  0.02040742  0.02040742
    0.02040742]
  [ 0.02040742  0.02040742  0.02040742  0.02040742  0.02040742  0.02040742
    0.02040742]]

 [[ 0.02039393  0.02046089  0.02064561  0.02039393  0.02039393  0.02039393
    0.02039393]
  [ 0.02039731  0.02039393  0.02042882  0.02039393  0.02039393  0.02039393
    0.02039393]
  [ 0.02039393  0.02039393  0.02040839  0.02039393  0.02039393  0.02039393
    0.02039393]
  [ 0.02039393  0

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1


Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b1a8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


trained action prob map predicted by initial model for a starting game
[[[ 0.02044491  0.0204074   0.0204074   0.0204074   0.0204074   0.0204074
    0.0204074 ]
  [ 0.0204074   0.0204074   0.0204074   0.0204074   0.0204074   0.0204074
    0.0204074 ]
  [ 0.0204074   0.0204074   0.0204074   0.0204074   0.0204074   0.0204074
    0.0204074 ]
  [ 0.0204074   0.0204074   0.0204074   0.0204074   0.0204074   0.0204074
    0.0204074 ]
  [ 0.0204074   0.0204074   0.0204074   0.0204074   0.0204074   0.0204074
    0.0204074 ]
  [ 0.0204074   0.0204074   0.0204074   0.0204074   0.0204074   0.0204074
    0.0204074 ]
  [ 0.0204074   0.0204074   0.0204074   0.0204074   0.0204074   0.0204074
    0.0204074 ]]

 [[ 0.02039324  0.02046709  0.02065576  0.02039324  0.02039324  0.02039324
    0.02039324]
  [ 0.02040218  0.02039324  0.02043024  0.02039324  0.02039324  0.02039324
    0.02039324]
  [ 0.02039324  0.02039324  0.02040604  0.02039324  0.02039324  0.02039324
    0.02039324]
  [ 0.02039324  0.020393

Exception ignored in: <generator object reinforcement_data_generator at 0x7f7653a4b0f8>
Traceback (most recent call last):
  File "<ipython-input-50-b196c6d99be7>", line 67, in reinforcement_data_generator
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/contextlib.py", line 77, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/yuze/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 4342, in get_controller
    if self.stack[-1] is not default:
IndexError: list index out of range


actor critic prepared
Epoch 1/1

KeyboardInterrupt: 