In [1]:
from collections import deque
import random
import gym
import numpy as np
from tensorflow.keras import models, layers, optimizers
import time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class DQN(object):
    """Deep Q-Network agent with a replay buffer and a separate target network.

    Two identically shaped Keras models are kept: ``primary_model`` is trained
    on every call to :meth:`train`, while ``target_model`` supplies the
    bootstrap values and is synchronised with the primary weights every
    ``update_freq`` training steps.
    """

    # Exploration schedule: epsilon is lowered by EPSILON_DECAY on every
    # training step until it reaches EPSILON_MIN.
    EPSILON_MIN = 0.1
    EPSILON_DECAY = 0.0003

    def __init__(self, num_states, num_actions, update_freq, replay_size, optim_lr, epsilon):
        """Create the agent.

        Args:
            num_states: Size of the observation vector (network input width).
            num_actions: Number of discrete actions (network output width).
            update_freq: Number of training steps between target-network syncs.
            replay_size: Capacity of the replay buffer; training only starts
                once the buffer holds this many transitions.
            optim_lr: Learning rate passed to the Adam optimizer.
            epsilon: Initial exploration probability for epsilon-greedy.
        """
        self.train_steps = 0
        self.num_states = num_states
        self.num_actions = num_actions
        self.update_freq = update_freq
        self.replay_size = replay_size
        self.optim_lr = optim_lr
        self.epsilon = epsilon

        # Bounded FIFO buffer: the oldest transition is dropped when full.
        self.replay_queue = deque(maxlen=self.replay_size)

        self.primary_model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with one hidden ReLU layer of
            30 units and a linear output layer with one unit per action,
            trained with mean squared error and Adam.
        """
        hidden_units = 30
        model = models.Sequential([
            layers.Dense(hidden_units,
                         input_dim=self.num_states,
                         activation='relu',
                         kernel_initializer='glorot_normal',
                         bias_initializer='glorot_normal'),
            layers.Dense(self.num_actions,
                         activation="linear",
                         kernel_initializer='glorot_normal',
                         bias_initializer='glorot_normal')
        ])

        model.compile(loss='mean_squared_error',
                      optimizer=optimizers.Adam(self.optim_lr))
        return model

    def epsilon_greedy(self, obs):
        """Pick an action: random with probability ``epsilon``, else greedy.

        Args:
            obs: A single observation; it is wrapped into a batch of one
                before being passed to the primary model.

        Returns:
            The index of the chosen action.
        """
        if np.random.uniform(low=0, high=1) < self.epsilon:
            action = np.random.choice(self.num_actions)
        else:
            action = np.argmax(self.primary_model.predict(np.array([obs]))[0])
        return action

    def store_experience(self, obs, action, obs_next, reward, done=False):
        """Append one transition to the replay buffer.

        The ``reward`` argument is accepted for interface compatibility but is
        intentionally replaced by a shaped reward computed from ``obs``
        (the original "reward optimize" step).

        Args:
            obs: Observation before the action; ``obs[0]`` and ``obs[1]`` are
                read (presumably position and velocity for MountainCar --
                TODO confirm against the environment).
            action: Index of the action that was taken.
            obs_next: Observation after the action.
            reward: Environment reward (unused, see above).
            done: True when ``obs_next`` is terminal. Terminal transitions are
                not bootstrapped in :meth:`train`. Defaults to False, which
                reproduces the previous always-bootstrap behaviour.
        """
        # Shaped reward: constant step penalty, a bonus for distance from
        # obs[0] == -0.5, an extra bonus once obs[0] exceeds 0.1, and a small
        # term proportional to obs[1].
        shaped_reward = -10 + abs(obs[0] + 0.5) + 4 * max(obs[0] - 0.1, 0) + 0.2 * obs[1]
        self.replay_queue.append((obs, action, obs_next, shaped_reward, done))

    def train(self, batch_size, alpha, gamma):
        """Run one training step on a random minibatch from the replay buffer.

        Does nothing until the replay buffer is full. Otherwise it decays
        epsilon, periodically syncs the target network, and fits the primary
        network towards the blended target
        ``(1 - alpha) * Q(s, a) + alpha * td_target``.

        Args:
            batch_size: Number of transitions to sample.
            alpha: Blend factor between the current estimate and the TD target.
            gamma: Discount factor applied to the bootstrapped next-state value.
        """
        if len(self.replay_queue) < self.replay_size:
            return
        self.train_steps += 1

        # Clamp so the decay can never undershoot the exploration floor.
        if self.epsilon > self.EPSILON_MIN:
            self.epsilon = max(self.EPSILON_MIN, self.epsilon - self.EPSILON_DECAY)

        if self.train_steps % self.update_freq == 0:
            self.target_model.set_weights(self.primary_model.get_weights())

        replay_batch = random.sample(self.replay_queue, batch_size)

        obs_batch = np.array([replay[0] for replay in replay_batch])
        obs_next_batch = np.array([replay[2] for replay in replay_batch])

        Q = self.primary_model.predict(obs_batch)
        Q_next = self.target_model.predict(obs_next_batch)

        for row, (_, act, _, reward, done) in enumerate(replay_batch):
            # A terminal transition has no successor value to bootstrap from.
            td_target = reward if done else reward + gamma * np.amax(Q_next[row])
            Q[row][act] = (1 - alpha) * Q[row][act] + alpha * td_target

        self.primary_model.fit(obs_batch, Q, verbose=0)

In [3]:
# Learning-rate-like blend factor schedule for the Q update, and run length.
num_episodes = 100
initial_alpha = 1.0
min_alpha = 0.001

# Use the unwrapped MountainCar environment and record the run with a
# Monitor wrapper; force=True lets it reuse an existing output directory.
env = gym.wrappers.Monitor(
    gym.make('MountainCar-v0').unwrapped,
    'MountainCar-v0-DQN',
    force=True,
)

In [4]:
dqn = DQN(
    num_states = env.observation_space.shape[0], 
    num_actions = env.action_space.n,
    update_freq = 200, 
    replay_size = 2000,
    optim_lr = 0.001,
    epsilon = 0.15)

# try/finally guarantees env.close() runs even if training raises or the
# cell is interrupted, so the Monitor wrapper is always shut down cleanly.
try:
    for episode in range(num_episodes):
        obs = env.reset()
        steps = 0

        # Blend factor decays by 20% every 5 episodes, floored at min_alpha.
        alpha = max(min_alpha, initial_alpha * (0.8 ** (episode // 5)))
        print('----------')
        print("alpha = " + str(alpha))
        while True:
            env.render()
            action = dqn.epsilon_greedy(obs) 
            obs_next, reward, terminate, _ = env.step(action)
            dqn.store_experience(obs, action, obs_next, reward)
            dqn.train(
                batch_size=64, 
                alpha=alpha, 
                gamma=0.99)
            steps += 1
            if terminate:
                break
            obs = obs_next
        print("Episode {} completed in {} steps".format(episode + 1, steps))

    # Keep rendering for about five seconds after the last episode.
    start = time.time()
    while True: 
        env.render()
        if (time.time()-start)>=5:
            break
finally:
    env.close()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
----------
alpha = 1.0
Instructions for updating:
Use tf.cast instead.
Episode 1 completed in 3046 steps
----------
alpha = 1.0
Episode 2 completed in 1593 steps
----------
alpha = 1.0
Episode 3 completed in 3790 steps
----------
alpha = 1.0
Episode 4 completed in 272 steps
----------
alpha = 1.0
Episode 5 completed in 362 steps
----------
alpha = 0.8
Episode 6 completed in 731 steps
----------
alpha = 0.8
Episode 7 completed in 491 steps
----------
alpha = 0.8
Episode 8 completed in 2002 steps
----------
alpha = 0.8
Episode 9 completed in 977 steps
----------
alpha = 0.8
Episode 10 completed in 11812 steps
----------
alpha = 0.6400000000000001
Episode 11 completed in 1001 steps
----------
alpha = 0.6400000000000001
Episode 12 completed in 5947 steps
----------
alpha = 0.6400000000000001
Episode 13 completed in 1332 steps
----------
alpha = 0.6400000000000001
Episode 