In [None]:
pip install gym_super_mario_bros==7.3.0

Collecting gym_super_mario_bros==7.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/a0/b8/07460212c2568f78b02995834e7bdc25349e586473919e2983e01b984abf/gym_super_mario_bros-7.3.0-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 8.9MB/s 
[?25hCollecting nes-py>=8.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/00/98/f87eacc9ff3ddfe97ecc889165119317cd4782f5839c24b39f88a1a7e7d7/nes_py-8.1.1.tar.gz (74kB)
[K     |████████████████████████████████| 81kB 9.8MB/s 
Building wheels for collected packages: nes-py
  Building wheel for nes-py (setup.py) ... [?25l[?25hdone
  Created wheel for nes-py: filename=nes_py-8.1.1-cp36-cp36m-linux_x86_64.whl size=449927 sha256=ef8a227225513e404942ce5ef87bbc2371aab2211be3f5b1e180b043ddfca009
  Stored in directory: /root/.cache/pip/wheels/04/d7/e4/0949e4c8947993c5555730a3b15f3cdc5a86507b95388dd608
Successfully built nes-py
Installing collected packages: nes-py, gym-super-mario-bros
Successfully ins

In [None]:
import numpy as np
import os
# Make sure a PATH entry exists in the process environment (left untouched if
# it is already set) before the remaining imports run.
os.environ.setdefault('PATH', '')
from collections import deque
import gym
from gym import spaces
import cv2
# Switch off OpenCV's OpenCL usage for this process.
cv2.ocl.setUseOpenCL(False)


class NoopResetEnv(gym.Wrapper):
    """Start every episode with a run of no-op steps."""

    def __init__(self, env, noop_max=30):
        """Wrap `env` so that reset plays a random number of no-ops.

        The no-op is assumed to be action 0; construction fails with an
        AssertionError when the wrapped environment's first action is not
        named 'NOOP'.

        :param env: environment to wrap.
        :param noop_max: upper bound (inclusive) on the number of no-ops.
        """
        super().__init__(env)
        self.noop_action = 0
        self.override_num_noops = None
        self.noop_max = noop_max
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """Reset, then play between 1 and `noop_max` no-op steps.

        `override_num_noops`, when set, replaces the random draw.
        Returns the observation produced by the last no-op step (or by the
        extra reset issued when a no-op ended the episode).
        """
        self.env.reset(**kwargs)
        noops = self.override_num_noops
        if noops is None:
            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
        assert noops > 0
        obs = None
        for _noop_index in range(noops):
            obs, _reward, done, _info = self.env.step(self.noop_action)
            if done:
                # The no-op finished the episode: start over and keep going.
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(ac)


class FireResetEnv(gym.Wrapper):
    """Press FIRE after every reset for games that stay frozen until firing."""

    def __init__(self, env):
        """Wrap `env`; requires action 1 to be 'FIRE' and at least 3 actions.

        Raises AssertionError when the wrapped environment's action set does
        not satisfy those two conditions.
        """
        gym.Wrapper.__init__(self, env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        """Reset the environment, then take action 1 followed by action 2.

        If either forced step ends the episode the environment is reset
        again. Returns the observation the agent should act on next.
        """
        self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            # Return the observation of the freshly reset episode instead of
            # the terminal frame produced by the step above.
            obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(ac)


class EpisodicLifeEnv(gym.Wrapper):
    """Report the loss of a life as the end of an episode."""

    def __init__(self, env):
        """Wrap `env` so that losing a life signals `done`.

        The underlying game is only truly reset once it reports a real
        game over. Lives are read through `env.unwrapped.ale.lives()`.
        """
        super().__init__(env)
        self.was_real_done = True
        self.lives = 0

    def step(self, action):
        """Step the environment and flag `done` whenever a life was lost."""
        obs, reward, done, info = self.env.step(action)
        # Remember whether the underlying game itself finished.
        self.was_real_done = done
        lives = self.env.unwrapped.ale.lives()
        # Require lives > 0: some games (Qbert) linger at zero lives for a few
        # frames, and the reset must wait until the environment reports done.
        if 0 < lives < self.lives:
            done = True
        # Store the new count afterwards so bonus lives are picked up too.
        self.lives = lives
        return obs, reward, done, info

    def reset(self, **kwargs):
        """Fully reset only after a real game over.

        After a mere life loss a single action-0 step moves the game on from
        the lost-life state, so every state stays reachable while the learner
        sees per-life episodes.
        """
        if not self.was_real_done:
            obs, _, _, _ = self.env.step(0)
        else:
            obs = self.env.reset(**kwargs)
        self.lives = self.env.unwrapped.ale.lives()
        return obs


class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action for `skip` frames and max-pool the last two."""

    def __init__(self, env, skip=4):
        """Wrap `env` so only every `skip`-th frame is returned.

        :param env: environment to wrap.
        :param skip: number of times each action is repeated.
        """
        super().__init__(env)
        self._skip = skip
        # Holds the two most recent raw frames for the element-wise maximum.
        frame_shape = env.observation_space.shape
        self._obs_buffer = np.zeros((2,) + frame_shape, dtype=np.uint8)

    def step(self, action):
        """Repeat `action`, sum the rewards and max over the stored frames.

        The loop stops early when the environment reports `done`; in that
        case the buffer may still hold frames from an earlier call, which is
        accepted because the observation on the done frame does not matter.
        """
        total_reward = 0.0
        done = None
        first_kept = self._skip - 2
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            # Only the last two repeats are stored: slot 0, then slot 1.
            slot = i - first_kept
            if slot in (0, 1):
                self._obs_buffer[slot] = obs
            total_reward += reward
            if done:
                break
        max_frame = self._obs_buffer.max(axis=0)
        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment and pass its observation through."""
        return self.env.reset(**kwargs)


class ClipRewardEnv(gym.RewardWrapper):
    """Reward wrapper that keeps only the sign of each reward."""

    def __init__(self, env):
        """Wrap `env`; no extra state is needed."""
        super().__init__(env)

    def reward(self, reward):
        """Map the reward to -1, 0 or +1 according to its sign."""
        clipped = np.sign(reward)
        return clipped


class WarpFrame(gym.ObservationWrapper):
    """Resize frames (84x84 by default), optionally converting to grayscale."""

    def __init__(self, env, width=84, height=84, grayscale=True):
        """Wrap `env` and declare the warped observation space.

        :param env: environment to wrap.
        :param width: output frame width in pixels.
        :param height: output frame height in pixels.
        :param grayscale: when True frames have 1 channel, otherwise 3.
        """
        super().__init__(env)
        self.width = width
        self.height = height
        self.grayscale = grayscale
        channels = 1 if self.grayscale else 3
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(self.height, self.width, channels),
            dtype=np.uint8,
        )

    def observation(self, frame):
        """Return `frame` resized and, in grayscale mode, as (H, W, 1)."""
        target_size = (self.width, self.height)
        if not self.grayscale:
            return cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)
        # cvtColor drops the channel axis; add it back as the last axis.
        return np.expand_dims(resized, -1)


class LazyFrames(object):
    """Deferred concatenation of a list of frames along the last axis."""

    def __init__(self, frames):
        """Keep a reference to `frames` without copying them.

        Frames shared between consecutive observations are therefore stored
        only once, which matters for large replay buffers. The object should
        be turned into a numpy array only right before it is fed to the
        model.
        """
        self._out = None
        self._frames = frames

    def _force(self):
        """Concatenate the frames on first use and cache the result."""
        if self._out is not None:
            return self._out
        self._out = np.concatenate(self._frames, axis=-1)
        # The individual frames are no longer needed once merged.
        self._frames = None
        return self._out

    def __array__(self, dtype=None):
        """Numpy conversion hook; casts to `dtype` when one is given."""
        merged = self._force()
        return merged if dtype is None else merged.astype(dtype)

    def __len__(self):
        """Length of the concatenated array along its first axis."""
        return len(self._force())

    def __getitem__(self, i):
        """Index into the concatenated array."""
        return self._force()[i]


class FrameStack(gym.Wrapper):
    """Observation is the concatenation of the `k` most recent frames."""

    def __init__(self, env, k):
        """Stack the last `k` frames along the final (channel) axis.

        Observations are returned as `LazyFrames`, which avoids copying the
        frames shared between consecutive observations.

        :param env: environment to wrap.
        :param k: number of frames to stack.
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        # Bounded deque: appending a new frame drops the oldest one.
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)

    def reset(self, **kwargs):
        """Reset the environment and fill the stack with the first frame.

        Keyword arguments are forwarded to the wrapped environment, matching
        the `reset(**kwargs)` signature of the other wrappers in this file.
        """
        ob = self.env.reset(**kwargs)
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        """Step the environment and push the new frame onto the stack."""
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        """Return the current stack as a `LazyFrames` snapshot."""
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))


def wrapper(env):
    """Wrap `env` with the preprocessing stack used for training.

    Applied in order: frame skipping with max-pooling (skip=4), FIRE-on-reset
    when the action set contains 'FIRE', frame warping, and a 4-frame stack.
    EpisodicLifeEnv, NoopResetEnv and ClipRewardEnv are not applied here.
    """
    wrapped = MaxAndSkipEnv(env, skip=4)
    action_meanings = wrapped.unwrapped.get_action_meanings()
    if 'FIRE' in action_meanings:
        wrapped = FireResetEnv(wrapped)
    wrapped = WarpFrame(wrapped)
    return FrameStack(wrapped, 4)

In [None]:
import time
import random
import numpy as np
from collections import deque
import tensorflow as tf
import matplotlib.pyplot as plt


import datetime
import numpy as np
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY

from tensorflow.keras.layers import Conv2D, Flatten, Dense, Input, LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def parameters_defenition():
    """Build and return the dictionary of run settings.

    Of these entries, `DQNagent` in this file reads only 'learning_rate' and
    'memory_size'.
    NOTE(review): 'environment' names 'hungrysnek-raw-16-v1', which the
    training code in this file does not use - presumably a leftover; confirm.
    """
    return {
        'epsilon_decay': 1/90,
        'learning_rate': 0.0005,
        'first_layer_size': 200,
        'second_layer_size': 100,
        'third_layer_size': 50,
        'output_dim': 4,
        'output_activation': 'sigmoid',
        'episodes_to_play': 150,
        'memory_size': 100000,
        'memory_batch': 500,
        'train': True,
        'environment': 'hungrysnek-raw-16-v1',
        'render': False,
        'console_log': True,
    }


def cuda_memgrowth():
    """Turn on memory growth for every visible GPU and report the counts.

    Does nothing when TensorFlow lists no physical GPU. A RuntimeError from
    TensorFlow is printed rather than propagated.
    """
    import tensorflow as tf
    physical_gpus = tf.config.experimental.list_physical_devices('GPU')
    if not physical_gpus:
        return
    try:
        # Memory growth currently has to be the same across all GPUs.
        for device in physical_gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(physical_gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Memory growth must be configured before the GPUs are initialized.
        print(err)


def draw_graph(reward, name):
    """Plot the sequence `reward` as a line chart titled `name` and show it."""
    series = np.asarray(reward)
    plt.plot(series)
    plt.title(name)
    plt.show()



class DQNagent:
    """Double-DQN agent with an online and a target convolutional Q-network.

    The online network picks actions and is trained; the target network
    provides the bootstrap values and is overwritten with the online weights
    every `copy` steps.
    """

    def __init__(self, parameters, input_dim, actions):
        """Create both networks, the replay memory and the optimizer.

        :param parameters: settings dict; only 'learning_rate' and
            'memory_size' are read here.
        :param input_dim: shape of one observation, e.g. (84, 84, 4).
        :param actions: number of discrete actions.
        """
        self.input_dim = input_dim
        self.actions = actions
        self.learning_rate = parameters['learning_rate']
        self.model_online = self.neural_network_online()
        self.model_target = self.neural_network_target()

        self.memory = deque(maxlen=parameters['memory_size'])
        self.eps = 1                # exploration rate used by run()
        self.eps_decay = 0.000200   # read by the training loop
        self.eps_min = 0.1          # not applied anywhere in this class
        self.gamma = 0.90           # discount factor
        self.batch_size = 64
        self.burnin = 100000        # steps before any gradient update
        self.copy = 10000           # steps between target-network syncs
        self.step = 0               # number of run() calls so far
        self.learn_each = 3         # learn() calls skipped between updates
        self.learn_step = 0
        self.save_each = 200000     # steps between checkpoints
        self.flag_reached = 0
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)
        self.episodes_to_play = 6000

    def _build_network(self):
        """Assemble the uncompiled convolutional Q-network used by both models."""
        input_img = Input(shape=(self.input_dim))
        img = input_img
        for filters, kernel, stride in ((32, 8, 4), (64, 4, 2), (64, 3, 1)):
            img = Conv2D(filters=filters, kernel_size=[kernel, kernel], strides=[stride, stride])(img)
            img = LeakyReLU(0.01)(img)
        img = Flatten()(img)
        img = Dense(512)(img)
        img = LeakyReLU(0.01)(img)
        img = Dense(self.actions)(img)
        return Model(inputs=input_img, outputs=img)

    def neural_network_online(self):
        """Build the online network, compiled with Huber loss and Adam."""
        model = self._build_network()
        model.compile(loss='huber_loss', optimizer=Adam(self.learning_rate))
        return model

    def neural_network_target(self):
        """Build the (uncompiled) target network."""
        return self._build_network()

    def update_memory(self, experience):
        """Append one experience tuple to the replay memory."""
        self.memory.append(experience)

    def _q_values(self, state):
        """Online-network output for a single state scaled to [0, 1]."""
        return self.model_online((np.expand_dims(state, 0)).astype('float32')/255.)

    def run(self, state):
        """Choose an action epsilon-greedily and count the step.

        With probability `eps` a uniformly random action is returned,
        otherwise the argmax of the online network's output.
        """
        if np.random.rand() < self.eps:
            action = np.random.randint(low=0, high=self.actions)
        else:
            action = np.argmax(self._q_values(state))
        self.step += 1
        return action

    def copy_model(self):
        """Overwrite the target network's weights with the online weights."""
        self.model_target.set_weights(self.model_online.get_weights())

    def save_weights(self):
        """Save both networks to timestamped .hdf5 files in the working dir."""
        # One timestamp for both files so the pair always matches.
        stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        online_weights_name = 'leaky32_online_weights' + stamp + '.hdf5'
        target_weights_name = 'leaky32_target_weights' + stamp + '.hdf5'
        self.model_online.save_weights(online_weights_name)
        self.model_target.save_weights(target_weights_name)
        print('Online model was saved as ' + online_weights_name)
        print('Target model was saved as ' + target_weights_name)

    def learn(self):
        """Sync/checkpoint on schedule and run one training update when due.

        Returns early (without training) during burn-in and on the
        `learn_each` calls skipped between two updates.
        """
        # Sync target network
        if self.step % self.copy == 0:
            self.copy_model()
        # Checkpoint model
        if self.step % self.save_each == 0:
            self.save_weights()
        # Break if burn-in
        if self.step < self.burnin:
            return
        # Break if no training
        if self.learn_step < self.learn_each:
            self.learn_step += 1
            return

        # Sample batch; experiences are (state, next_state, action, reward, done)
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = map(np.array, zip(*batch))

        state = tf.convert_to_tensor(state / 255., dtype=tf.float32)
        next_state = tf.convert_to_tensor(next_state / 255., dtype=tf.float32)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32)
        action = tf.convert_to_tensor(action, dtype=tf.int32)
        done = tf.convert_to_tensor(np.array(done).astype(int), dtype=tf.float32)

        # Double-DQN target: the online network selects the next action, the
        # target network evaluates it. Computed outside the tape because no
        # gradient flows through the target.
        next_action = tf.argmax(self.model_online(next_state), axis=1)
        target_q = self.model_target(next_state)
        target_value = tf.reduce_sum(tf.one_hot(next_action, self.actions) * target_q, axis=1)
        target_value = tf.stop_gradient((1 - done) * self.gamma * target_value + reward)

        dqn_variable = self.model_online.trainable_variables
        with tf.GradientTape() as tape:
            main_q = self.model_online(state)
            main_value = tf.reduce_sum(tf.one_hot(action, self.actions) * main_q, axis=1)
            # Half mean squared TD error.
            error = tf.reduce_mean(tf.square(main_value - target_value) * 0.5)

        dqn_grads = tape.gradient(error, dqn_variable)
        self.opt.apply_gradients(zip(dqn_grads, dqn_variable))

        # Reset learn step
        self.learn_step = 0

    def reward_func(self, info, reward, done):
        """Shape the reward: +2 when the flag is reached, -10 when done."""
        if info['flag_get']:
            return reward + 2
        if done:
            return reward - 10.
        return reward

    def check(self):
        """Play one greedy episode on a fresh 1-1 environment.

        Saves both networks when the final x position is at least 3160 and
        prints the final position.
        """
        env_test = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
        env_test = JoypadSpace(env_test, RIGHT_ONLY)
        env_test = wrapper(env_test)
        try:
            env_test.reset()
            state, reward, done, info = env_test.step(0)
            while not done:
                action = np.argmax(self._q_values(state))
                state, reward, done, info = env_test.step(action)
            if info['x_pos'] >= 3160:
                # Save through `self`: no global `agent` exists in this file.
                stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
                self.model_online.save_weights('onlineFULL' + stamp + '.hdf5')
                self.model_target.save_weights('targetFULL' + stamp + '.hdf5')

            print('Checked ' + str(info['x_pos']))
        finally:
            # Release the emulator even if the episode raised.
            env_test.close()

    def model_test(self, env):
        """Run the greedy policy on `env` for 5000 steps, rendering each one.

        The online network's output is printed at every step; the
        environment is reset whenever an episode ends.
        """
        done = True
        for _ in range(5000):
            if done:
                state = env.reset()
            q_values = self._q_values(state)
            print(q_values)
            action = np.argmax(q_values)
            state, reward, done, info = env.step(action)
            env.render()

def train_model(parameters):
    """Train a DQNagent on SuperMarioBros 1-1 (right-only actions).

    Plays 6000 episodes, printing per-episode results and a summary every
    10 episodes, then plots reward, final position and running mean reward
    and saves the final weights.

    :param parameters: settings dict handed to `DQNagent`.

    NOTE(review): epsilon is lowered by `agent.eps_decay` once per episode
    and `agent.eps_min` is never applied here, so epsilon goes negative
    after enough episodes - confirm this is intended.
    """
    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, RIGHT_ONLY)
    env = wrapper(env)

    # Parameters
    states = (84, 84, 4)
    actions = env.action_space.n

    # Agent
    agent = DQNagent(parameters, states, actions)

    # Episodes
    episodes = 6000
    rewards = []
    maxXpos = 0
    max_reward = 0
    graph_reward = np.zeros(episodes)
    graph_pos = np.zeros(episodes)
    graph_mean_reward = np.zeros(episodes)

    # Timing: start1 marks the whole run, start the current 10-episode window
    start1 = time.time()
    start = time.time()
    step = 0

    try:
        # Main loop
        for e in range(episodes):
            # Reset env
            state = env.reset()
            total_reward = 0
            episode_steps = 0

            # Play
            while True:
                # Run agent
                action = agent.run(state)

                # Perform action
                next_state, reward, done, info = env.step(action)

                # Remember
                agent.update_memory(experience=(state, next_state, action, reward, done))

                # Replay
                agent.learn()

                total_reward += reward
                state = next_state
                episode_steps += 1

                # If done break loop
                if done or info['flag_get']:
                    break

            # Mean reward per step of this episode
            rewards.append(total_reward / episode_steps)

            if maxXpos < info['x_pos']:
                maxXpos = info['x_pos']
            if max_reward < total_reward:
                max_reward = total_reward
            if info['flag_get'] == True:
                agent.flag_reached = agent.flag_reached + 1

            # eps decay (once per episode)
            agent.eps = agent.eps - agent.eps_decay

            if agent.eps <= 0 and info['x_pos'] == 3161:
                # One timestamp so the online/target files form a matching pair.
                stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
                agent.model_online.save_weights('leaky64_online_weights_full' + stamp + '.hdf5')
                agent.model_target.save_weights('leaky64_target_weights_full' + stamp + '.hdf5')

            # graph variables
            graph_reward[e] = total_reward
            graph_pos[e] = info['x_pos']
            # Average only the episodes played so far; the rest of the array
            # is still zero and would drag the mean down.
            graph_mean_reward[e] = np.mean(graph_reward[:e + 1])

            print("Episode reward: " + str(total_reward) + ' - Pos: ' + str(info['x_pos']))
            # Print
            if e % 10 == 0:
                end = time.time()
                print('Flags reached: ' + str(agent.flag_reached) + ' - Max reward: ' +str(max_reward))
                print('Episode {e} - '
                      'Frame {f} - '
                      'Frames/sec {fs} - '
                      'Epsilon {eps} - '
                      'Mean Reward {r} - '
                      'Time {t} sec - '
                      'Max pos {pos}'.format(e=e,
                                               f=agent.step,
                                               fs=np.round((agent.step - step) / (time.time() - start)),
                                               eps=np.round(agent.eps, 4),
                                               r=np.mean(rewards[-100:]),
                                               t=round(end - start1),
                                               pos=maxXpos))

                start = time.time()
                step = agent.step
    finally:
        # Release the emulator whether training finished or was interrupted.
        env.close()

    draw_graph(graph_reward,'Rewards')
    draw_graph(graph_pos, 'Position')
    draw_graph(graph_mean_reward, 'Mean reward')
    agent.save_weights()

if __name__ == '__main__':
    # Configure GPU memory growth first; the models are only built later,
    # inside train_model.
    cuda_memgrowth()
    # Build the settings dict and start training with it.
    parameters = parameters_defenition()
    train_model(parameters)

1 Physical GPUs, 1 Logical GPUs


  return (self.ram[0x86] - self.ram[0x071c]) % 256


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode reward: 232.0 - Pos: 298
Episode reward: 1269.0 - Pos: 1417
Episode reward: 246.0 - Pos: 310
Flags reached: 3 - Max reward: 2967.0
Episode 1520 - Frame 365524 - Frames/sec 39.0 - Epsilon 0.6958 - Mean Reward 5.606831557996079 - Time 7784 sec - Max pos 3161
Episode reward: 786.0 - Pos: 898
Episode reward: 238.0 - Pos: 301
Episode reward: 645.0 - Pos: 803
Episode reward: 995.0 - Pos: 1126
Episode reward: 784.0 - Pos: 898
Episode reward: 1011.0 - Pos: 1128
Episode reward: 606.0 - Pos: 677
Episode reward: 636.0 - Pos: 722
Episode reward: 611.0 - Pos: 687
Episode reward: 636.0 - Pos: 722
Flags reached: 3 - Max reward: 2967.0
Episode 1530 - Frame 367831 - Frames/sec 39.0 - Epsilon 0.6938 - Mean Reward 5.490519952057455 - Time 7843 sec - Max pos 3161
Episode reward: 803.0 - Pos: 898
Episode reward: 609.0 - Pos: 683
Episode reward: 1230.0 - Pos: 1432
Episode reward: 751.0 - Pos: 840
Episode reward: 1309.0 - Pos: 1437
Epis