In [2]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym # 강화학습 환경 제공
import matplotlib.pyplot as plt
%matplotlib inline

# Python 2/3 compatibility shim: keep `xrange` when the name already exists,
# otherwise alias it to `range`. Only NameError is caught so that unrelated
# failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    xrange = xrange
except NameError:
    xrange = range

In [3]:
# Create the CartPole-v0 environment that the training loop resets and steps.
env = gym.make('CartPole-v0')

### The Policy-Based Agent

In [12]:
gamma = 0.99  # Discount factor applied once per time step.

def discount_rewards(r):
    """Compute the discounted return at every step of a 1-D reward sequence.

    Walking backwards through the sequence, each entry becomes
    ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``.

    Args:
        r: 1-D array-like of numeric rewards (list, tuple or ndarray,
            including object-dtype arrays whose elements are numbers).

    Returns:
        A float64 ndarray with the same shape as ``r`` holding the
        discounted returns. An empty input yields an empty array.
    """
    # Work in float64 regardless of the input dtype: allocating the result
    # with the input's dtype would truncate the fractional discounted values
    # whenever integer rewards are passed in.
    rewards = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros(rewards.shape, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [5]:
 # Scratch check: np.zeros_like keeps the input's shape and dtype (integers here).
 np.zeros_like([1, 2])

array([0, 0])

In [8]:
# Scratch check: reversed(xrange(0, 3)) iterates the indices in descending order.
list(reversed(xrange(0, 3)))

[2, 1, 0]

In [13]:
class agent():
    """Policy network plus the ops needed to train it from buffered gradients.

    The network maps a batch of states to a softmax distribution over
    actions through one ReLU hidden layer (both layers without biases).
    Gradients of the loss are exposed via ``self.gradients`` and applied
    through ``self.update_batch``, which reads them back from the
    placeholders in ``self.gradient_holders``.
    """

    def __init__(self, lr, s_size, a_size, h_size):
        """Build the graph.

        Args:
            lr: learning rate passed to the Adam optimizer.
            s_size: width of each state vector fed to ``state_in``.
            a_size: number of outputs of the softmax layer.
            h_size: number of units in the hidden layer.
        """
        # --- Feed-forward part: state in, action probabilities out. ---
        self.state_in = tf.placeholder(dtype=tf.float32,
                                       shape=[None, s_size])
        hidden_layer = slim.fully_connected(
            self.state_in,
            h_size,
            activation_fn=tf.nn.relu,
            biases_initializer=None)
        self.output = slim.fully_connected(
            hidden_layer,
            a_size,
            activation_fn=tf.nn.softmax,
            biases_initializer=None)
        self.chosen_action = tf.argmax(self.output, 1)

        # --- Training part. ---
        # Rewards and the actions actually taken are fed in to compute the
        # loss that drives the network update.
        self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None])
        self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None])

        # Flat position of each row's chosen action inside the flattened
        # output: row_index * number_of_columns + action.
        n_rows = tf.shape(self.output)[0]
        n_cols = tf.shape(self.output)[1]
        row_offsets = tf.range(0, n_rows) * n_cols
        self.indexes = row_offsets + self.action_holder
        flat_output = tf.reshape(self.output, [-1])
        self.responsible_outputs = tf.gather(flat_output, self.indexes)

        log_prob = tf.log(self.responsible_outputs)
        self.loss = -tf.reduce_mean(log_prob * self.reward_holder)

        # One gradient placeholder per trainable variable, so gradients can
        # be accumulated outside the graph and fed back in for the update.
        trainable = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, name=str(position) + '_holder')
            for position in range(len(trainable))
        ]

        self.gradients = tf.gradients(self.loss, trainable)

        adam = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = adam.apply_gradients(
            zip(self.gradient_holders, trainable))

## 에이전트 학습시키기

In [15]:
tf.reset_default_graph()  # Clear the TensorFlow graph.

# Build the agent.
# s_size: width of the state vector, a_size: number of actions,
# h_size: hidden layer width.
myAgent = agent(lr=1e-2, s_size=4, a_size=2, h_size=8)

total_episodes = 5000  # Total number of episodes to train the agent on.
max_ep = 999           # Upper bound on the number of steps in one episode.
update_frequency = 5   # Apply buffered gradients when the episode index is a non-zero multiple of this.

init = tf.global_variables_initializer()

# Launch the TensorFlow graph.
with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_length = []

    # Gradient buffer: gradients from several episodes are summed here and
    # applied in a single update. It is created from the current variable
    # values only to obtain arrays of the right shapes, then zeroed.
    gradBuffer = sess.run(tf.trainable_variables())
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    while i < total_episodes:
        s = env.reset()  # CartPole
        running_reward = 0
        ep_history = []  # Experience trace (rollout) of the current episode.
        for j in range(max_ep):
            # Sample an action stochastically from the network's output.
            a_dist = sess.run(myAgent.output,
                              feed_dict={myAgent.state_in: [s]})
            # Sample the action *index* directly. Sampling a probability
            # value and mapping it back to its position would resolve ties
            # to the first action whenever two probabilities are equal.
            a = np.random.choice(len(a_dist[0]), p=a_dist[0])

            # Take the action in the environment and observe the outcome.
            s1, r, d, _ = env.step(a)  # d: done
            ep_history.append([s, a, r, s1])
            s = s1
            running_reward += r

            if d:
                # Episode finished: compute this episode's gradients.
                ep_history = np.array(ep_history)
                ep_history[:, 2] = discount_rewards(ep_history[:, 2])
                feed_dict = {myAgent.reward_holder: ep_history[:, 2],
                             myAgent.action_holder: ep_history[:, 1],
                             myAgent.state_in: np.vstack(ep_history[:, 0])}
                grads = sess.run(myAgent.gradients,
                                 feed_dict=feed_dict)

                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad

                if i % update_frequency == 0 and i != 0:
                    # Apply the accumulated gradients to the network.
                    feed_dict = dict(zip(myAgent.gradient_holders,
                                         gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)

                    # Reset the buffer.
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0

                total_reward.append(running_reward)
                # j is the zero-based index of the final step of the episode.
                total_length.append(j)
                break

        # Report the mean reward over the most recent (up to 100) episodes.
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        i += 1



10.0
19.71
19.81
21.21
31.55
37.02
42.94
52.32
60.03
77.8
129.52
126.47
141.46
159.94
162.27
156.02
166.56
158.58
175.32
184.21
182.81
177.68
182.8
188.09
193.94
195.32
193.91
194.41
195.26
197.42
196.27
196.65
192.8
196.27
199.0
196.32
198.62
194.43
197.11
195.74
192.47
195.86
193.97
180.77
191.38
196.02
195.97
192.96
187.15
194.93
