In [1]:
import keras
import tensorflow as tf
import gym
import numpy as np
import sys
import matplotlib.pyplot as plt
import os
from collections import deque
import random

  for external in metadata.entry_points().get(self.group, []):


In [2]:
env = gym.make('LunarLander-v2')

In [3]:
print('state space : %s'%(env.observation_space))
print('action space : %s'%(env.action_space))

state space : Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)
action space : Discrete(4)


In [4]:
tf.config.list_physical_devices(
    device_type=None
)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


In [5]:
class ReplayBuffer():
    
    def __init__(self,n_memory):
        self.buffer = deque(maxlen = n_memory)

       
    
    def add_experience(self,state,action,reward,next_state,done):
        self.buffer.append((state,action,reward,next_state,done))
    
        
        
    
        
    def sample_experience(self,batch_size):
        sample_size = min(len(self.buffer),batch_size)
        sample = random.choices(self.buffer,k = sample_size)
        return map(np.array, zip(*sample))

In [6]:
class PrioritizedReplayBuffer():
    
    def __init__(self,n_memory):
        self.buffer = deque(maxlen = n_memory)
        self.priorities = deque(maxlen = n_memory)
        
    

       
    
    def add_experience(self,state,action,reward,next_state,done):
        self.buffer.append((state,action,reward,next_state,done))
        self.priorities.append(max(self.priorities,default = 1))
        
    def get_probabilities(self,priority_scale):
        scaled_priorities = np.array(self.priorities)**priority_scale
        sample_probabilities = scaled_priorities / sum(scaled_priorities)
        return sample_probabilities
    
    def get_importance(self,probabilities):
        importance = 1/len(self.buffer) * 1/probabilities
        importance_normalized = importance / max(importance)
        return importance_normalized
    
    def set_priorities(self,indices,errors,offset = 0.1):
        for i,e in zip(indices,errors):
            self.priorities[i] = e + offset
    
        
        
    
        
    def sample_experience(self,batch_size,priority_scale=0.7):
        sample_size = min(len(self.buffer),batch_size)
        sample_probs = self.get_probabilities(priority_scale)
        sample_indices = random.choices(range(len(self.buffer)),k = sample_size,weights = sample_probs)
        sample = np.array(self.buffer)[sample_indices]
        importance = self.get_importance(sample_probs[sample_indices])
        return map(np.array, zip(*sample)), importance, sample_indices 

In [7]:
class Qnetwork(keras.Model):
    def __init__(self, action_size,state_size,unit1,unit2):
        super(Qnetwork,self).__init__()
        self.dense1 = keras.layers.Dense(128,activation = 'elu',input_shape = [state_size])
        self.dense2 = keras.layers.Dense(128,activation = 'elu')
        self.dense3 = keras.layers.Dense(128,activation = 'elu')
        self.Q_values = keras.layers.Dense(action_size)
        
    def call(self,state):
        x = self.dense1(state)
        x = self.dense2(x)
        x = self.dense3(x)
        Q_values = self.Q_values(x)
        
        return Q_values
    
class DuelingQnetwork(keras.Model):
    def __init__(self, action_size,state_size,unit1,unit2):
        super(DuelingQnetwork,self).__init__()
        self.action_size = action_size
        self.state_size = state_size 
        
        self.value_stream = keras.Sequential([
            keras.layers.Dense(512,activation = 'relu',input_shape=[self.state_size]),
            keras.layers.Dense(256,activation = 'relu'),
            keras.layers.Dense(256,activation = 'relu'),
            keras.layers.Dense(1)
        ])
        
        self.advantage = keras.Sequential([
            keras.layers.Dense(512,activation = 'relu',input_shape=[self.state_size]),
            keras.layers.Dense(256,activation = 'relu'),
            keras.layers.Dense(256,activation = 'relu'),
            keras.layers.Dense(self.action_size)
        ])
        
    def call(self,state):
        values = self.value_stream(state)
        advantage = self.advantage(state)
        
        Q_values = values + (advantage-tf.reduce_mean(advantage))
        
        return Q_values

        

In [8]:
class DQNAgent():
    def __init__(self, memory_size,action_size,state_size,optimizer,loss_fn, unit1 = 256,unit2 = 256, gamma = 0.999):
        self.action_size = action_size
        self.state_size = state_size
        self.gamma = gamma
        self.ReplayBuffer = ReplayBuffer(memory_size)
        self.QnetworkLocal = DuelingQnetwork(action_size,state_size,unit1,unit2)
        self.QnetworkTarget = DuelingQnetwork(action_size,state_size,unit1,unit2)
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.QnetworkLocal.compile(optimizer= optimizer,loss = loss_fn)
        self.QnetworkTarget.set_weights(self.QnetworkLocal.get_weights())
        #self.importance = 1
        #self.epsilon = 1
        
    def remember(self,exp):
        state,action,reward,next_state,done = exp
        self.ReplayBuffer.add_experience(state,action,reward,next_state,done )
        
        
    def choose_action(self,state,epsilon):
        if np.random.rand()<epsilon:
            action = np.random.randint(self.action_size)
        else:
            Q_values = self.QnetworkLocal.predict(state[np.newaxis])
            action = np.argmax(Q_values)
        return action
    
    def learn(self,batch_size):
        exp = self.ReplayBuffer.sample_experience(batch_size)
        states,actions,rewards,next_states,dones = exp 
        next_Q_values = self.QnetworkLocal.predict(next_states)
        best_next_actions = np.argmax(next_Q_values,axis=1)
        next_mask = tf.one_hot(best_next_actions,action_size)
        next_best_Q_values = tf.reduce_sum(self.QnetworkTarget.predict(next_states)*next_mask,axis=1)
        target_Q_values = rewards + (1-dones)*self.gamma*next_best_Q_values
        mask = tf.one_hot(actions,self.action_size)
        with tf.GradientTape() as tape:
            Q_values = tf.reduce_sum(self.QnetworkLocal(states)*mask,axis=1)
            loss = loss_fn(target_Q_values,Q_values)
        gradient = tape.gradient(loss,self.QnetworkLocal.trainable_variables)
        optimizer.apply_gradients(zip(gradient,self.QnetworkLocal.trainable_variables))
        #errors = tf.math.abs(target_Q_values-Q_values)
        #self.ReplayBuffer.set_priorities(indices,errors)
        #self.importance = self.importance**(1-self.epsilon)
        
    def soft_update(self,eta= 0.001):
        target_weights = self.QnetworkTarget.get_weights()
        local_weights = self.QnetworkLocal.get_weights()
        update_weights = []
        for i in range(len(target_weights)):
            update_weights.append((1-eta)*target_weights[i]+eta*local_weights[i])
        self.QnetworkTarget.set_weights(update_weights)
        
        
            
        
        
        

0.001

In [9]:

lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=1000,
    decay_rate=0.99)


optimizer = keras.optimizers.Adam(lr_schedule)
loss_fn = tf.keras.losses.Huber()


batch_size = 32
memory_size = 100000
action_size = env.action_space.n
state_size = env.observation_space.shape[0]


done = False 
history = []
avg_score = 0
epsilon_start = 1.0
n_episodes_1 = 500
n_episodes = 1500
epsilon_end = 0.03
decay = (epsilon_end/epsilon_start)**(1/n_episodes_1)

    

In [11]:
agent = DQNAgent(memory_size,action_size,state_size,optimizer,loss_fn)

In [12]:
epsilon = epsilon_start
max_avg_score = 0
decay_up = False
lr_step = 0
for ep in range(n_episodes):
    state = env.reset()
    score = 0
    done = False
    step =0

    if decay_up:
        epsilon = max(epsilon*decay,0.01)
    
    while not done:
        lr_step+=1
        action = agent.choose_action(state,epsilon)
        next_state,reward,done,info = env.step(action)
        score+=reward
        step+=1 
        exp = (state,action,reward,next_state,done)
        agent.remember(exp)
        state = next_state
        if ep>50: 
            decay_up = True
            agent.learn(batch_size)
            if lr_step%5==0:
                agent.soft_update()
   
    history.append(score)
    avg_score =  np.mean(history[-100:])
    if (ep>50) & (avg_score>=max_avg_score+10):
        max_avg_score = avg_score
        agent.QnetworkLocal.save('DuelingQnetwork_LunarLander_2.hf5')
        
        
    print('episode :', ep ,'score %.2f average_score %.2f epsilon %.2f lr %s '%(score,
                                                                         avg_score,
                                                                         epsilon,
                                                                         np.format_float_scientific(agent.optimizer.lr(lr_step).numpy())
))
        
        
        
    

episode : 0 score -255.66 average_score -255.66 epsilon 1.00 lr 9.990055e-04 
episode : 1 score -200.44 average_score -228.05 epsilon 1.00 lr 9.978315e-04 
episode : 2 score -127.60 average_score -194.57 epsilon 1.00 lr 9.967991e-04 
episode : 3 score -78.05 average_score -165.44 epsilon 1.00 lr 9.955576e-04 
episode : 4 score -263.42 average_score -185.03 epsilon 1.00 lr 9.945875e-04 
episode : 5 score -135.48 average_score -176.78 epsilon 1.00 lr 9.937482e-04 
episode : 6 score -246.39 average_score -186.72 epsilon 1.00 lr 9.929397e-04 
episode : 7 score -514.69 average_score -227.72 epsilon 1.00 lr 9.919222e-04 
episode : 8 score -138.59 average_score -217.81 epsilon 1.00 lr 9.910653e-04 
episode : 9 score -160.00 average_score -212.03 epsilon 1.00 lr 9.897115e-04 
episode : 10 score -195.14 average_score -210.50 epsilon 1.00 lr 9.888862e-04 
episode : 11 score -197.95 average_score -209.45 epsilon 1.00 lr 9.879426e-04 
episode : 12 score -173.12 average_score -206.66 epsilon 1.00 l

episode : 70 score -325.47 average_score -171.02 epsilon 0.88 lr 9.268623e-04 
episode : 71 score -99.67 average_score -170.03 epsilon 0.87 lr 9.2585676e-04 
episode : 72 score -110.50 average_score -169.22 epsilon 0.86 lr 9.2498254e-04 
episode : 73 score -160.30 average_score -169.09 epsilon 0.86 lr 9.242391e-04 
episode : 74 score -172.02 average_score -169.13 epsilon 0.85 lr 9.2331064e-04 
episode : 75 score -24.53 average_score -167.23 epsilon 0.85 lr 9.2257786e-04 
episode : 76 score -95.31 average_score -166.30 epsilon 0.84 lr 9.215678e-04 
episode : 77 score -122.27 average_score -165.73 epsilon 0.83 lr 9.206697e-04 
episode : 78 score -196.64 average_score -166.12 epsilon 0.83 lr 9.1972644e-04 
episode : 79 score -180.91 average_score -166.31 epsilon 0.82 lr 9.184886e-04 
episode : 80 score -128.94 average_score -165.85 epsilon 0.82 lr 9.175107e-04 
episode : 81 score -216.93 average_score -166.47 epsilon 0.81 lr 9.165614e-04 
episode : 82 score -178.53 average_score -166.62 e

episode : 174 score -77.88 average_score -93.35 epsilon 0.42 lr 7.732851e-04 
episode : 175 score -213.63 average_score -95.24 epsilon 0.42 lr 7.717789e-04 
episode : 176 score -37.86 average_score -94.66 epsilon 0.42 lr 7.7060075e-04 
episode : 177 score -96.27 average_score -94.40 epsilon 0.41 lr 7.6904567e-04 
episode : 178 score -27.16 average_score -92.71 epsilon 0.41 lr 7.680802e-04 
episode : 179 score -60.90 average_score -91.51 epsilon 0.41 lr 7.666611e-04 
episode : 180 score -61.29 average_score -90.83 epsilon 0.40 lr 7.649601e-04 
episode : 181 score -177.86 average_score -90.44 epsilon 0.40 lr 7.6343934e-04 
episode : 182 score -70.01 average_score -89.36 epsilon 0.40 lr 7.605751e-04 
episode : 183 score -8.45 average_score -88.37 epsilon 0.40 lr 7.5922324e-04 
episode : 184 score -53.53 average_score -87.91 epsilon 0.39 lr 7.564889e-04 
episode : 185 score -84.74 average_score -87.50 epsilon 0.39 lr 7.543631e-04 
episode : 186 score 12.23 average_score -86.56 epsilon 0.39

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: DuelingQnetwork_LunarLander_2.hf5/assets
episode : 272 score -41.15 average_score 10.18 epsilon 0.21 lr 4.0261453e-04 
episode : 273 score -141.86 average_score 8.17 epsilon 0.21 lr 3.9858843e-04 
episode : 274 score 114.77 average_score 10.10 epsilon 0.21 lr 3.9460254e-04 
episode : 275 score -144.05 average_score 10.80 epsilon 0.21 lr 3.9331967e-04 
episode : 276 score 39.28 average_score 11.57 epsilon 0.21 lr 3.893865e-04 
episode : 277 score 75.67 average_score 13.29 epsilon 0.20 lr 3.854926e-04 
episode : 278 score -193.59 average_score 11.62 epsilon 0.20 lr 3.8334835e-04 
episode : 279 score 105.32 average_score 13.29 epsilon 0.20 lr 3.7951485e-04 
episode : 280 score 84.47 average_score 14.74 epsilon 0.20 lr 3.7571974e-04 
episode : 281 score 228.79 average_score 18.81 epsilon 0.20 lr 3.7262109e-04 
Please report this to the TensorFlow team. When filing the bug, se

episode : 322 score -18.87 average_score 62.23 epsilon 0.15 lr 2.691133e-04 
episode : 323 score 58.84 average_score 61.39 epsilon 0.15 lr 2.6642217e-04 
episode : 324 score 134.30 average_score 62.37 epsilon 0.15 lr 2.6375795e-04 
episode : 325 score 226.96 average_score 64.42 epsilon 0.15 lr 2.6238835e-04 
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
INFO:tensorflow:Assets written to: DuelingQnetwork_LunarLander_2.hf5/assets
episode : 326 score 106.95 average_score 66.89 epsilon 0.15 lr 2.597645e-04 
episode : 327 score 224.07 average_score 68.04 epsilon 0.14 lr 2.5807563e-04 
episode : 328 score 13.96 average_s

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
INFO:tensorflow:Assets written to: DuelingQnetwork_LunarLander_2.hf5/assets
episode : 373 score 262.69 average_score 112.27 epsilon 0.10 lr 1.8510042e-04 
episode : 374 score 250.22 average_score 113.63 epsilon 0.10 lr 1.8430037e-04 
episode : 375 score 149.11 average_score 116.56 epsilon 0.10 lr 1.8245736e-04 
episode : 376 score 222.21 average_score 118.39 epsilon 0.10 lr 1.8082895e-04 
episode : 377 score 134.48 average_score 118.98 epsilon 0.10 lr 1.7929976e-04 
episode : 378 score -13.13 average_score 120.78 epsilon 0.10 lr 1.7912326e-04 
episode : 379 score 156.42 average_score 121.29 epsilon 0.10 lr 1.7733204e-04 
episode : 380 score 171.22 average_score 122.16 epsilon 0.10 lr 1.7567343e-04 
Please report this to the TensorFlow team. When filing the bug, set the verbosit

INFO:tensorflow:Assets written to: DuelingQnetwork_LunarLander_2.hf5/assets
episode : 411 score 269.95 average_score 167.56 epsilon 0.08 lr 1.4526724e-04 
episode : 412 score -125.62 average_score 164.07 epsilon 0.08 lr 1.4410395e-04 
episode : 413 score 240.38 average_score 164.55 epsilon 0.08 lr 1.4343782e-04 
episode : 414 score 285.01 average_score 167.10 epsilon 0.08 lr 1.4250672e-04 
episode : 415 score 46.26 average_score 164.76 epsilon 0.08 lr 1.4108163e-04 
episode : 416 score 256.33 average_score 166.11 epsilon 0.08 lr 1.4060884e-04 
episode : 417 score 253.80 average_score 167.39 epsilon 0.08 lr 1.3975787e-04 
episode : 418 score 248.20 average_score 167.91 epsilon 0.08 lr 1.3864844e-04 
episode : 419 score 266.02 average_score 168.24 epsilon 0.08 lr 1.3762111e-04 
episode : 420 score 247.79 average_score 168.49 epsilon 0.08 lr 1.3701663e-04 
episode : 421 score 240.44 average_score 169.05 epsilon 0.07 lr 1.3624763e-04 
episode : 422 score 287.73 average_score 172.12 epsilon

episode : 465 score -110.46 average_score 212.79 epsilon 0.05 lr 1.0984669e-04 
episode : 466 score 228.36 average_score 212.86 epsilon 0.05 lr 1.0937409e-04 
episode : 467 score 109.08 average_score 214.64 epsilon 0.05 lr 1.0828035e-04 
episode : 468 score 17.31 average_score 212.27 epsilon 0.05 lr 1.0719754e-04 
episode : 469 score -120.94 average_score 209.64 epsilon 0.05 lr 1.0692854e-04 
episode : 470 score 258.23 average_score 209.90 epsilon 0.05 lr 1.0650918e-04 
episode : 471 score 29.20 average_score 208.76 epsilon 0.05 lr 1.0625684e-04 
episode : 472 score 268.65 average_score 208.98 epsilon 0.05 lr 1.0578374e-04 
episode : 473 score 241.14 average_score 208.77 epsilon 0.05 lr 1.0527995e-04 
episode : 474 score 278.69 average_score 209.05 epsilon 0.05 lr 1.0502947e-04 
episode : 475 score 257.92 average_score 210.14 epsilon 0.05 lr 1.04384395e-04 
episode : 476 score 24.21 average_score 208.16 epsilon 0.05 lr 1.0334055e-04 
episode : 477 score 275.55 average_score 209.57 epsi

episode : 557 score 266.86 average_score 221.81 epsilon 0.03 lr 7.176263e-05 
episode : 558 score 253.05 average_score 221.74 epsilon 0.03 lr 7.1370625e-05 
episode : 559 score 274.02 average_score 224.03 epsilon 0.03 lr 7.108928e-05 
episode : 560 score 253.96 average_score 223.85 epsilon 0.03 lr 7.085746e-05 
episode : 561 score 252.29 average_score 223.64 epsilon 0.03 lr 7.059802e-05 
episode : 562 score 255.21 average_score 223.99 epsilon 0.03 lr 7.0299204e-05 
episode : 563 score 276.93 average_score 224.22 epsilon 0.03 lr 7.005306e-05 
episode : 564 score 248.15 average_score 224.11 epsilon 0.03 lr 6.981409e-05 
episode : 565 score 281.14 average_score 228.03 epsilon 0.03 lr 6.9638896e-05 
episode : 566 score 278.24 average_score 228.53 epsilon 0.03 lr 6.931142e-05 
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Pleas

episode : 624 score 236.41 average_score 255.30 epsilon 0.02 lr 5.5441116e-05 
episode : 625 score 261.61 average_score 255.37 epsilon 0.02 lr 5.5307555e-05 
episode : 626 score 266.59 average_score 255.22 epsilon 0.02 lr 5.4965098e-05 
episode : 627 score 262.85 average_score 255.03 epsilon 0.02 lr 5.4840955e-05 
episode : 628 score 301.59 average_score 255.43 epsilon 0.02 lr 5.468465e-05 
episode : 629 score 279.17 average_score 255.48 epsilon 0.02 lr 5.4541382e-05 
episode : 630 score 287.99 average_score 255.54 epsilon 0.02 lr 5.4405613e-05 
episode : 631 score 284.38 average_score 255.75 epsilon 0.02 lr 5.4187327e-05 
episode : 632 score 285.38 average_score 255.68 epsilon 0.02 lr 5.4006818e-05 
episode : 633 score 280.44 average_score 255.81 epsilon 0.02 lr 5.3837746e-05 
episode : 634 score 257.62 average_score 255.75 epsilon 0.02 lr 5.3691856e-05 
episode : 635 score 280.25 average_score 255.68 epsilon 0.02 lr 5.3530744e-05 
episode : 636 score 252.84 average_score 255.35 epsil

episode : 716 score 275.45 average_score 268.30 epsilon 0.01 lr 4.073093e-05 
episode : 717 score 249.77 average_score 268.05 epsilon 0.01 lr 4.0598097e-05 
episode : 718 score 266.81 average_score 267.93 epsilon 0.01 lr 4.0483617e-05 
episode : 719 score 281.94 average_score 267.72 epsilon 0.01 lr 4.0286348e-05 
episode : 720 score 275.29 average_score 267.50 epsilon 0.01 lr 4.0031275e-05 
episode : 721 score 260.92 average_score 267.29 epsilon 0.01 lr 3.990995e-05 
episode : 722 score 2.35 average_score 264.64 epsilon 0.01 lr 3.9847833e-05 
episode : 723 score 286.64 average_score 264.90 epsilon 0.01 lr 3.9733062e-05 
episode : 724 score 270.61 average_score 265.24 epsilon 0.01 lr 3.9640516e-05 
episode : 725 score 279.25 average_score 265.42 epsilon 0.01 lr 3.9534287e-05 
episode : 726 score 287.84 average_score 265.63 epsilon 0.01 lr 3.936064e-05 
episode : 727 score 277.82 average_score 265.78 epsilon 0.01 lr 3.921849e-05 
episode : 728 score 268.00 average_score 265.45 epsilon 0.

episode : 821 score 288.44 average_score 265.49 epsilon 0.01 lr 2.7585113e-05 
episode : 822 score 253.66 average_score 268.00 epsilon 0.01 lr 2.7516719e-05 
episode : 823 score 265.83 average_score 267.79 epsilon 0.01 lr 2.7445183e-05 
episode : 824 score 267.49 average_score 267.76 epsilon 0.01 lr 2.7376043e-05 
episode : 825 score 294.03 average_score 267.91 epsilon 0.01 lr 2.7282105e-05 
episode : 826 score 25.46 average_score 265.29 epsilon 0.01 lr 2.7237993e-05 
episode : 827 score 241.14 average_score 264.92 epsilon 0.01 lr 2.7021504e-05 
episode : 828 score 249.32 average_score 264.73 epsilon 0.01 lr 2.6965348e-05 
episode : 829 score 287.30 average_score 264.61 epsilon 0.01 lr 2.6909034e-05 
episode : 830 score 286.74 average_score 264.74 epsilon 0.01 lr 2.6836653e-05 
episode : 831 score 285.20 average_score 264.74 epsilon 0.01 lr 2.6794607e-05 
episode : 832 score 289.67 average_score 265.11 epsilon 0.01 lr 2.6695954e-05 
episode : 833 score 174.98 average_score 264.18 epsil

episode : 925 score 267.57 average_score 263.39 epsilon 0.01 lr 2.0187894e-05 
episode : 926 score 252.96 average_score 265.67 epsilon 0.01 lr 2.0149782e-05 
episode : 927 score 257.48 average_score 265.83 epsilon 0.01 lr 2.0091951e-05 
episode : 928 score 271.62 average_score 266.05 epsilon 0.01 lr 2.0049185e-05 
episode : 929 score 281.87 average_score 266.00 epsilon 0.01 lr 1.9999274e-05 
episode : 930 score 275.17 average_score 265.88 epsilon 0.01 lr 1.9945477e-05 
episode : 931 score 291.26 average_score 265.94 epsilon 0.01 lr 1.9884834e-05 
episode : 932 score 254.07 average_score 265.59 epsilon 0.01 lr 1.98463e-05 
episode : 933 score 283.98 average_score 266.68 epsilon 0.01 lr 1.9797095e-05 
episode : 934 score 273.96 average_score 266.81 epsilon 0.01 lr 1.9751382e-05 
episode : 935 score 283.72 average_score 266.75 epsilon 0.01 lr 1.9718853e-05 
episode : 936 score 297.73 average_score 267.00 epsilon 0.01 lr 1.9659294e-05 
episode : 937 score 281.10 average_score 267.05 epsilo

episode : 1017 score 287.90 average_score 273.50 epsilon 0.01 lr 1.591996e-05 
episode : 1018 score 264.97 average_score 273.24 epsilon 0.01 lr 1.5886555e-05 
episode : 1019 score 292.56 average_score 273.55 epsilon 0.01 lr 1.5845733e-05 
episode : 1020 score 253.05 average_score 273.35 epsilon 0.01 lr 1.5811689e-05 
episode : 1021 score 275.95 average_score 273.42 epsilon 0.01 lr 1.5776772e-05 
episode : 1022 score 283.58 average_score 273.28 epsilon 0.01 lr 1.5724843e-05 
episode : 1023 score 272.24 average_score 273.13 epsilon 0.01 lr 1.5669628e-05 
episode : 1024 score 272.67 average_score 273.24 epsilon 0.01 lr 1.5635806e-05 
episode : 1025 score 293.74 average_score 273.50 epsilon 0.01 lr 1.559453e-05 
episode : 1026 score 295.28 average_score 273.92 epsilon 0.01 lr 1.554102e-05 
episode : 1027 score 276.84 average_score 274.12 epsilon 0.01 lr 1.5514022e-05 
episode : 1028 score 276.12 average_score 274.16 epsilon 0.01 lr 1.5477739e-05 
episode : 1029 score 136.90 average_score 2

episode : 1120 score 300.52 average_score 267.65 epsilon 0.01 lr 1.2115871e-05 
episode : 1121 score 262.15 average_score 267.51 epsilon 0.01 lr 1.2095798e-05 
episode : 1122 score 286.10 average_score 267.54 epsilon 0.01 lr 1.2071022e-05 
episode : 1123 score 275.91 average_score 267.58 epsilon 0.01 lr 1.2046421e-05 
episode : 1124 score -4.98 average_score 264.80 epsilon 0.01 lr 1.2033594e-05 
episode : 1125 score 281.26 average_score 264.67 epsilon 0.01 lr 1.1997365e-05 
episode : 1126 score 299.69 average_score 264.72 epsilon 0.01 lr 1.1964733e-05 
episode : 1127 score 280.55 average_score 264.76 epsilon 0.01 lr 1.19308725e-05 
episode : 1128 score 285.33 average_score 264.85 epsilon 0.01 lr 1.19056e-05 
episode : 1129 score 269.26 average_score 266.17 epsilon 0.01 lr 1.1882887e-05 
episode : 1130 score 276.71 average_score 266.14 epsilon 0.01 lr 1.1856401e-05 
episode : 1131 score 289.50 average_score 266.20 epsilon 0.01 lr 1.1818925e-05 
episode : 1132 score 294.37 average_score 

episode : 1223 score 275.41 average_score 264.72 epsilon 0.01 lr 9.330873e-06 
episode : 1224 score 279.23 average_score 267.56 epsilon 0.01 lr 9.303436e-06 
episode : 1225 score 273.23 average_score 267.48 epsilon 0.01 lr 9.288394e-06 
episode : 1226 score 267.45 average_score 267.16 epsilon 0.01 lr 9.267135e-06 
episode : 1227 score 279.94 average_score 267.15 epsilon 0.01 lr 9.247131e-06 
episode : 1228 score 54.10 average_score 264.84 epsilon 0.01 lr 9.224668e-06 
episode : 1229 score 273.25 average_score 264.88 epsilon 0.01 lr 9.2021655e-06 
episode : 1230 score 275.41 average_score 264.86 epsilon 0.01 lr 9.176768e-06 
episode : 1231 score 282.70 average_score 264.80 epsilon 0.01 lr 9.162759e-06 
episode : 1232 score 258.91 average_score 264.44 epsilon 0.01 lr 9.1458305e-06 
episode : 1233 score 252.93 average_score 264.32 epsilon 0.01 lr 9.116282e-06 
episode : 1234 score 284.93 average_score 264.51 epsilon 0.01 lr 9.09048e-06 
episode : 1235 score 297.01 average_score 267.68 eps

episode : 1327 score 310.18 average_score 262.24 epsilon 0.01 lr 7.0435353e-06 
episode : 1328 score 263.13 average_score 264.33 epsilon 0.01 lr 7.0246597e-06 
episode : 1329 score 247.68 average_score 264.07 epsilon 0.01 lr 7.008442e-06 
episode : 1330 score 280.61 average_score 264.13 epsilon 0.01 lr 6.987412e-06 
episode : 1331 score 236.95 average_score 263.67 epsilon 0.01 lr 6.937798e-06 
episode : 1332 score 270.61 average_score 263.78 epsilon 0.01 lr 6.926931e-06 
episode : 1333 score 302.62 average_score 264.28 epsilon 0.01 lr 6.9126727e-06 
episode : 1334 score 270.57 average_score 264.14 epsilon 0.01 lr 6.9000394e-06 
episode : 1335 score 272.96 average_score 263.90 epsilon 0.01 lr 6.886599e-06 
episode : 1336 score 18.23 average_score 261.14 epsilon 0.01 lr 6.8787144e-06 
episode : 1337 score 253.62 average_score 260.80 epsilon 0.01 lr 6.8637282e-06 
episode : 1338 score 292.34 average_score 260.78 epsilon 0.01 lr 6.8486383e-06 
episode : 1339 score 266.53 average_score 260.

episode : 1431 score 235.31 average_score 263.42 epsilon 0.01 lr 5.4612424e-06 
episode : 1432 score 287.54 average_score 263.59 epsilon 0.01 lr 5.4486313e-06 
episode : 1433 score 308.03 average_score 263.65 epsilon 0.01 lr 5.4331576e-06 
episode : 1434 score 280.42 average_score 263.75 epsilon 0.01 lr 5.422793e-06 
episode : 1435 score 276.27 average_score 263.78 epsilon 0.01 lr 5.412285e-06 
episode : 1436 score 280.85 average_score 266.40 epsilon 0.01 lr 5.3991366e-06 
episode : 1437 score 277.74 average_score 266.65 epsilon 0.01 lr 5.3886183e-06 
episode : 1438 score 285.79 average_score 266.58 epsilon 0.01 lr 5.3783942e-06 
episode : 1439 score 253.03 average_score 266.45 epsilon 0.01 lr 5.363335e-06 
episode : 1440 score 290.01 average_score 266.91 epsilon 0.01 lr 5.351434e-06 
episode : 1441 score 284.68 average_score 267.13 epsilon 0.01 lr 5.3393455e-06 
episode : 1442 score 311.96 average_score 267.31 epsilon 0.01 lr 5.32793e-06 
episode : 1443 score 277.94 average_score 267.

In [69]:
agent.optimizer.lr()

<tensorflow.python.keras.optimizer_v2.learning_rate_schedule.ExponentialDecay at 0x7fc341be5b50>

In [17]:
agent.QnetworkLocal.save('DuelingQnetwork_LunarL.hf5')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: DuelingQnetwork_LunarL.hf5/assets


In [20]:
agent.QnetworkLocal = keras.models.load_model('DuelingQnetwork_LunarLander_2.hf5')


In [28]:
state = env.reset()
env.render('human')
n = 200
score =0
done = False
import time
time.sleep(2)
while not done:
    action = agent.choose_action(state,epsilon = 0.0)
    state,reward,done,_ = env.step(action)
    
    score+=reward
    env.render('human')
print(score)

292.8772222354757


In [25]:
dueling q network keras#model.save('MountainCar_V0_18_12.hf5')

In [29]:
env.close()