In [2]:
import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt

# Seed Python's and NumPy's global random generators with the same value
# so that runs of this notebook are repeatable.
seed = 0
random.seed(seed)
np.random.seed(seed)

### Helper Function

In [4]:
# Imports specifically so we can render outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display


def display_frames_as_gif(frames):
    """Show a sequence of image frames as a looping animation in the notebook.

    Args:
        frames: Indexable sequence of images that ``plt.imshow`` can draw.
            The first frame is drawn immediately, so the sequence must not
            be empty.
    """
    image = plt.imshow(frames[0])
    plt.axis('off')

    def show_frame(index):
        # Reuse the single image artist and only swap its pixel data.
        image.set_data(frames[index])

    figure = plt.gcf()
    movie = animation.FuncAnimation(figure, show_frame, frames=len(frames), interval=5)
    display(display_animation(movie, default_mode='loop'))

### Deep Q Learning Agent

In [11]:
class DQNAgent:
    """Deep Q-learning agent with experience replay and a target network.

    Two identically shaped fully connected networks are built in a private
    TensorFlow (1.x API) graph: ``q_prediction`` is the network being trained
    and ``q_target`` is a copy that is refreshed by :meth:`update_target` and
    used to bootstrap the TD target. Actions are chosen epsilon-greedily;
    epsilon starts at 1.0 and is decayed by :meth:`update_policy`.

    Args:
        obs_dim: Length of one observation vector.
        n_action: Number of discrete actions (width of the network output).
        seed: Seed passed to every kernel initializer.
        discount_factor: Discount applied to the bootstrapped next-state value.
        epsilon_decay: Factor epsilon is multiplied by on each
            ``update_policy`` call.
        epsilon_min: Epsilon is no longer decayed once it is not above this.
        learning_rate: Learning rate fed to the Adam optimizer at each step.
        batch_size: Number of transitions sampled per training step.
        memory_size: Capacity of the replay buffer; the oldest transitions
            are dropped when it is full.
        hidden_unit_size: Number of units in each of the two hidden layers.
        train_start: ``train_model`` only trains once the buffer holds more
            than this many transitions. It has to be smaller than
            ``memory_size``, otherwise training can never start.
    """

    def __init__(self, obs_dim, n_action, seed=0,
                 discount_factor=0.995, epsilon_decay=0.999, epsilon_min=0.01,
                 learning_rate=1e-3,
                 batch_size=64,
                 memory_size=2000, hidden_unit_size=64,
                 train_start=1000):

        self.seed = seed

        # Environment information
        self.obs_dim = obs_dim
        self.n_action = n_action
        self.discount_factor = discount_factor

        # Epsilon-greedy policy
        self.epsilon = 1.0
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Network hyperparameters
        self.hidden_unit_size = hidden_unit_size
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        # Minimum number of stored transitions before training begins.
        self.train_start = train_start

        # Experience replay buffer
        self.memory = deque(maxlen=memory_size)

        # Define the computational graph in TensorFlow. Everything is built
        # inside this agent's own graph so several agents can coexist.
        self.g = tf.Graph()
        with self.g.as_default():
            self.build_placeholders()
            self.build_model()
            self.build_loss()
            self.build_update_operation()
            self.init_session()  # Initialize all parameters in the graph

    def build_placeholders(self):
        """Create the placeholders fed at prediction and training time."""
        # Input state batch
        self.obs_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.obs_dim], name='obs')
        # TD target, one value per action
        self.target_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.n_action], name='target')
        self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='lr')

    def _build_q_network(self, scope):
        """Build one Q-network under ``scope`` and return its output tensor.

        The network has two tanh hidden layers followed by a linear layer
        with one output per action.
        """
        with tf.variable_scope(name_or_scope=scope):
            out = tf.layers.dense(inputs=self.obs_ph, units=self.hidden_unit_size, activation=tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(stddev=0.01, seed=self.seed),
                                  name='hidden1')
            out = tf.layers.dense(inputs=out, units=self.hidden_unit_size, activation=tf.tanh,
                                  kernel_initializer=tf.random_normal_initializer(stddev=0.01, seed=self.seed),
                                  name='hidden2')
            # The output dimension equals the number of actions.
            return tf.layers.dense(inputs=out, units=self.n_action,
                                   kernel_initializer=tf.random_normal_initializer(stddev=0.01, seed=self.seed),
                                   name='q_predict')

    def build_model(self):
        """Build the prediction and target networks and collect their weights."""
        self.q_predict = self._build_q_network('q_prediction')   # prediction network
        self.q_predict_old = self._build_q_network('q_target')   # target network

        # Because each network lives in its own variable scope, all of its
        # variables can be fetched at once by scope name. This is done here,
        # before the optimizer is built, so the lists hold only layer
        # parameters and the two lists line up one-to-one.
        self.weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_prediction')
        self.weights_old = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_target')

    def build_loss(self):
        """Define the squared TD error and the Adam training operation."""
        # target_ph already holds the TD target, which train_model computes
        # from the target network before feeding it in; the loss therefore
        # only compares it with the prediction network's output.
        self.loss = 0.5 * tf.reduce_sum(tf.square(self.target_ph - self.q_predict))
        self.optim = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph).minimize(self.loss)

    def build_update_operation(self):
        """Define the ops that copy prediction weights into the target network."""
        update_ops = []
        for var, var_old in zip(self.weights, self.weights_old):
            update_ops.append(var_old.assign(var))  # overwrite the target parameter
        self.update_ops = update_ops

    def init_session(self):
        """Create the session, initialize variables and set up summaries."""
        config = tf.ConfigProto()
        # Let TensorFlow grow its GPU memory allocation as needed.
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config, graph=self.g)
        self.sess.run(tf.global_variables_initializer())
        # The update ops only read and assign variables, so no feed_dict is
        # required. This makes both networks start with identical weights.
        self.sess.run(self.update_ops)

        # Summary writer
        summary_q = tf.summary.scalar(name='max_Q_predict', tensor=tf.reduce_max(self.q_predict))
        summary_q_old = tf.summary.scalar(name='max_Q_target', tensor=tf.reduce_max(self.q_predict_old))
        summary_loss = tf.summary.scalar(name='loss', tensor=self.loss)
        self.merge_q_step = 0
        self.merge_q = tf.summary.merge([summary_q, summary_q_old])
        self.merge_loss_step = 0
        self.merge_loss = tf.summary.merge([summary_loss])
        self.summary_writer = tf.summary.FileWriter('./tf_logs/dqn', graph=self.sess.graph)

    def update_target(self):
        """Copy the prediction network's parameters into the target network."""
        self.sess.run(self.update_ops)

    def update_policy(self):
        """Decay epsilon by ``epsilon_decay`` while it is above ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def _run_q(self, q_tensor, obs):
        """Evaluate ``q_tensor`` for a batch ``obs`` and log the Q summaries."""
        q_value, summary = self.sess.run([q_tensor, self.merge_q], feed_dict={self.obs_ph: obs})

        # Record the max-Q summaries under a shared, increasing step counter.
        self.merge_q_step += 1
        self.summary_writer.add_summary(summary, self.merge_q_step)

        return q_value

    def get_prediction_old(self, obs):
        """Return the target network's Q-values for a batch of observations."""
        return self._run_q(self.q_predict_old, obs)

    def get_prediction(self, obs):
        """Return the prediction network's Q-values for a batch of observations."""
        return self._run_q(self.q_predict, obs)

    def get_action(self, obs):
        """Pick an action for a single observation with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the action with the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.n_action)
        # The observation is wrapped in a batch of one, so the result has a
        # single row; [0] selects that row before taking the argmax.
        q_value = self.get_prediction([obs])
        return np.argmax(q_value[0])

    def add_experience(self, obs, action, reward, next_obs, done):
        """Append one transition to the replay buffer."""
        self.memory.append((obs, action, reward, next_obs, done))

    def train_model(self):
        """Run one training step on a random mini-batch from the replay buffer.

        Returns:
            The loss value of the step, or ``np.nan`` when no step was run
            because the buffer does not yet hold more than ``train_start``
            transitions (or fewer than ``batch_size``).
        """
        loss = np.nan
        n_entries = len(self.memory)

        # Training starts only once enough experience has been collected;
        # the batch-size check keeps random.sample from raising.
        if n_entries <= self.train_start or n_entries < self.batch_size:
            return loss

        # Random batch sampling
        mini_batch = random.sample(self.memory, self.batch_size)

        observations = np.zeros((self.batch_size, self.obs_dim))
        next_observations = np.zeros((self.batch_size, self.obs_dim))
        actions, rewards, dones = [], [], []

        for i, (obs, action, reward, next_obs, done) in enumerate(mini_batch):
            observations[i] = obs
            actions.append(action)
            rewards.append(reward)
            next_observations[i] = next_obs
            dones.append(done)

        # Start from the prediction network's own outputs so that actions
        # which were not taken contribute zero error; only the entry of the
        # taken action is replaced by the TD target below. The copy
        # guarantees a writable array.
        target = np.array(self.get_prediction(observations))
        next_q_value = self.get_prediction_old(next_observations)

        # Bellman update rule
        for i in range(self.batch_size):
            if dones[i]:
                # Terminal transition: no bootstrapped future value.
                target[i][actions[i]] = rewards[i]
            else:
                target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(next_q_value[i])

        loss, _, summary = self.sess.run(
            [self.loss, self.optim, self.merge_loss],
            feed_dict={self.obs_ph: observations,
                       self.target_ph: target,
                       self.learning_rate_ph: self.learning_rate})

        # Record the loss summary
        self.merge_loss_step += 1
        self.summary_writer.add_summary(summary, self.merge_loss_step)
        return loss
                