In [1]:
import gym
import os
import time
import numpy as np
import tensorflow as tf
from LinearEpsilonExplorer import LinearEpsilonExplorer
from ReplayMemory import ReplayMemory

  from ._conv import register_converters as _register_converters


In [2]:
class DQN_Agent:
    
    def __init__(self, 
                 sess, 
                 input_shape, 
                 action_num,
                 lr=2.5e-4,
                 gamma=0.99,
                 explorer=LinearEpsilonExplorer(1, 0.05, 1e5),
                 minibatch=32,
                 memory_size=5e5,
                 target_update_interval=1e4,
                 train_after=1e4):
        
        self.sess = sess
        self.explorer = explorer
        self.minibatch = minibatch
        self.target_update_interval = target_update_interval
        self.train_after = train_after
        self.gamma = gamma
        self.input_shape = list(input_shape)
        self.action_num = action_num
        
        self.replay_memory = ReplayMemory(memory_size)
        self.num_action_taken = 0
        
        self.X_Q = tf.placeholder(tf.float32, [None] + self.input_shape)
        self.X_t = tf.placeholder(tf.float32, [None] + self.input_shape)
        self.Q_network = self._build_network("Q_network", self.X_Q)
        self.target_network = self._build_network("target_network", self.X_t)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        
        with tf.variable_scope("optimizer"):
            self.actions = tf.placeholder(tf.int32, [None], name="actions")
            # Q estimate
            actions_one_hot = tf.one_hot(self.actions, self.action_num)
            Q_pred = tf.reduce_sum(tf.multiply(self.Q_network, actions_one_hot), axis=1)
            # td_target
            self.td_target = tf.placeholder(tf.float32, [None])
            # loss
            self.loss = tf.losses.huber_loss(self.td_target, Q_pred)
            self.train_step = self.optimizer.minimize(self.loss)
            
        self.eval_param = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="Q_network")
        self.target_param = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_network")
    
    def _build_network(self, scope_name, X):
        with tf.variable_scope(scope_name):
            fc1 = tf.layers.dense(X, 128, activation=tf.nn.relu)
            fc2 = tf.layers.dense(fc1, 256, activation=tf.nn.relu)
            fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
            fc4 = tf.layers.dense(fc3, 512, activation=tf.nn.relu)
            out = tf.layers.dense(fc4, self.action_num)
        return out
    
    def act(self, observation, is_training=True):
        if is_training:
            # choose action given state
            # follow a linearly decay epsilon greedy policy
            if self.num_action_taken >= self.train_after:
                if self.explorer.explore(self.num_action_taken - self.train_after):
                    action = self.explorer.choose_random_action(self.action_num)
                else:
                    state = np.reshape(observation, [1] + self.input_shape)
                    Q_values = self.sess.run(self.Q_network, feed_dict={self.X_Q : state})
                    action = np.argmax(Q_values[0])
            else:
                action = self.explorer.choose_random_action(self.action_num)
            self.num_action_taken += 1
        else:
            state = np.reshape(observation, [1] + self.input_shape)
            Q_values = self.sess.run(self.Q_network, feed_dict={self.X_Q : state})
            action = np.argmax(Q_values[0])
        return action
    
    def observe(self, pre_state, action, reward, post_state, done):
        # store transition in replay memory
        self.replay_memory.append(pre_state, action, reward, post_state, done)
        
    def train(self):
        loss = 0
        
        if self.num_action_taken >= self.train_after:
            # retrieve data
            pre_states, actions, rewards, post_states, dones = self.replay_memory.sample(self.minibatch)
            
            # Double DQN uses Q_network to choose action for post state
            # and then use target network to evaluate that policy
            Q_eval = self.sess.run(self.Q_network, feed_dict={self.X_Q:post_states})
            best_action = np.argmax(Q_eval, axis=1)
            
            # create one hot representation for action
            best_action_oh = np.zeros((best_action.size, self.action_num))
            best_action_oh[np.arange(best_action.size), best_action] = 1
            
            # evaluate through target_network
            Q_target = self.sess.run(self.target_network, feed_dict={self.X_t:post_states}) * best_action_oh
            Q_target = np.sum(Q_target, axis=1)
            
            y_batch = rewards + self.gamma * Q_target * (1 - dones)
            _, loss = self.sess.run([self.train_step, self.loss], feed_dict={self.X_Q:pre_states, self.actions:actions, self.td_target:y_batch})
        
            if self.num_action_taken % self.target_update_interval == 0:
                self._update_target_net()
        
        return loss
    
    def _update_target_net(self):
        ops = [tf.assign(dest_var, src_var) for dest_var, src_var in zip(self.target_param, self.eval_param)]
        sess.run(ops)

In [3]:
tf.reset_default_graph()

with tf.device("/gpu:0"):
    
    config = tf.ConfigProto(allow_soft_placement = True)
    with tf.Session(config=config) as sess:
        
        env = gym.make("LunarLander-v2")
        state = env.reset()
        action_num = env.action_space.n
        input_shape = env.observation_space.shape
        agent = DQN_Agent(sess, input_shape, action_num)
        
        episodic_reward = tf.get_variable("episodic_reward", (), trainable=False)
        episodic_step = tf.get_variable("episodic_step", (), trainable=False)
        tf.summary.scalar("episode_reward",episodic_reward)
        tf.summary.scalar("episode_step",episodic_step)
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter("logs/ddqn", sess.graph)
        saver = tf.train.Saver(max_to_keep=20)
        
        sess.run(tf.global_variables_initializer())
        
        e, episode_reward, global_step, episode_step = 0, 0, 0, 0
        
        while True:
            
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            # normalize reward
            reward /= 200
            episode_reward += reward
            agent.observe(state, action, reward, next_state, done)
            agent.train()
            state = next_state
            episode_step += 1
            global_step += 1
            
            if done:
                state = env.reset()
                summary = sess.run(merged, feed_dict={episodic_reward:episode_reward, episodic_step:episode_step})
                writer.add_summary(summary, global_step=e)
                if e % 10 == 0:
                    writer.flush()
                episode_reward = 0
                episode_step = 0
                e += 1
            
                if e % 50 == 0:
                    print(e, global_step)
                    saver.save(sess, "ddqn/model", global_step=e)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
50 4499
100 8872
150 13360
200 18179
250 23351
300 28490
350 34140
400 39770
450 45864
500 52410
550 60204
600 72462
650 105651
700 150309
750 193941
800 231374
850 265519
900 290364
950 319597
1000 354403
1050 391884
1100 427483
1150 456554
1200 488974
1250 523734
1300 548125
1350 575688
1400 602253
1450 627921
1500 647029
1550 666037
1600 695968
1650 728555
1700 757530
1750 786401
1800 809328
1850 828460
1900 849906
1950 869812
2000 888479
2050 903532
2100 922867
2150 942524
2200 960746
2250 979074
2300 996294
2350 1012356
2400 1031387
2450 1048007
2500 1067017
2550 1082376
2600 1102156
2650 1124151
2700 1144389
2750 1162164
2800 1178840
2850 1196103
2900 1216091
2950 1233550
3000 1248177
3050 1260898
3100 1276953
3150 1291564
3200 1306245
3250 1320922
3300 1337251
3350 1352553
3400 1366850
3450 1383531
3500 1398451
3550 1413794
3600 1426756
3650 1441098
3700 1455501
3750 14682

KeyboardInterrupt: 

In [4]:
tf.reset_default_graph()

with tf.device("/gpu:0"):
    
    config = tf.ConfigProto(allow_soft_placement = True)
    with tf.Session(config=config) as sess:
        
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./ddqn/checkpoint'))
        env = gym.make("LunarLander-v2")
        state = env.reset()
        action_num = env.action_space.n
        input_shape = env.observation_space.shape
        agent = DQN_Agent(sess, input_shape, action_num, lr=0.001)
        saver = tf.train.Saver()
        saver.restore(sess, ckpt.model_checkpoint_path)
        done = False
        
        for i in range(10):
            l = []
            while not done:
                env.render()
                time.sleep(0.01)
                action = agent.act(state, False)
                state, reward, done, info = env.step(action)
                l.append(reward)
                #print(reward)
                if done:
                    state = env.reset()
                    done = False
                    print(sum(l)/len(l))
                    break
        env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
INFO:tensorflow:Restoring parameters from ./ddqn/model-7350
1.3457919286027542
1.4343305253032543
1.2767006519789745
0.7952423384407983
0.8958160346709608
1.2978566474052975
1.5931179249314191
0.9855039762055069
0.33648068531533604
0.9404445155151422


In [5]:
reward

100

In [None]:
1- data[-1]