# bandit problem

In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

  from ._conv import register_converters as _register_converters


In [35]:
# Thresholds of the bandit's arms. pullBandit returns +1 when a standard-normal
# draw is greater than the arm's threshold, so a lower threshold means the arm
# pays out more often.
bandit_arms = [
    0.2,
    0,
    -1.2,
    0.3,
    -1.0,
]
# Number of arms the agent can choose from.
num_arms = len(bandit_arms)

def pullBandit(bandit):
    """Pull a single arm and return its reward.

    One sample is drawn from the standard normal distribution and compared
    with ``bandit``, the threshold of the chosen arm.

    Returns:
        1 when the sample is strictly greater than ``bandit`` (positive
        reward), otherwise -1 (negative reward).
    """
    sample = np.random.randn(1)
    # Guard clause: anything not strictly above the threshold is a loss.
    if not (sample > bandit):
        return -1
    return 1

In [32]:
# Agent implementation.
# Start from an empty default graph so re-running this cell does not stack
# duplicate ops on top of a previous definition.
tf.reset_default_graph()

# Feed-forward part of the network: one trainable weight per arm (all
# initialised to 1), turned into a probability distribution with softmax.
weights = tf.Variable(tf.ones([num_arms]))
output = tf.nn.softmax(weights)

# Training procedure.
# The reward and the chosen action are fed into the network to compute the
# loss, and the loss is then used to update the network.
# Both placeholders hold exactly one element (shape [1]).
reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
action_holder = tf.placeholder(shape=[1], dtype=tf.int32)

# Pick out the single softmax output that belongs to the chosen action:
# a slice of length 1 starting at the index held in action_holder.
responsible_output = tf.slice(output, action_holder, [1])
# loss = -(log(probability of the chosen action) * reward); minimising it
# raises that probability for positive rewards and lowers it for negative ones.
loss =-(tf.log(responsible_output)*reward_holder)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
# Op that applies one optimisation step on the loss when run in a session.
update = optimizer.minimize(loss)

In [36]:
# Train the agent.
# Total number of episodes to train the agent for.
total_episodes = 5000
# Per-arm scoreboard of accumulated reward, initialised to zero.
total_reward = np.zeros(num_arms)

init = tf.global_variables_initializer()

# Launch the TensorFlow graph in a session.
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        # Choose an action according to the Boltzmann (softmax) distribution.
        # The arm *index* is sampled directly. Sampling a probability value
        # and looking it up with argmax(actions == pick) would always resolve
        # ties to the lowest-indexed arm, e.g. while all weights are still
        # equal at the start of training.
        actions = sess.run(output)
        action = np.random.choice(num_arms, p=actions)

        # Pull the chosen arm of the bandit and receive its reward.
        reward = pullBandit(bandit_arms[action])

        # Update the network with the observed reward for the chosen action.
        _, resp, ww = sess.run(
            [update, responsible_output, weights],
            feed_dict={reward_holder: [reward], action_holder: [action]},
        )

        # Update the running reward total of the chosen arm.
        total_reward[action] += reward
        if i % 200 == 0:
            print(str(i) + " Running reward for the " + str(num_arms) + " arms of the bandit: " + str(total_reward))

# The arm with the largest learned weight is the agent's favourite; report it
# with a 1-based arm number.
print("\nThe agent thinks arm " + str(np.argmax(ww)+1) + " is the most promising...")
# The truly best arm is the one with the lowest threshold, i.e. the largest
# negated threshold.
if np.argmax(ww) == np.argmax(-np.array(bandit_arms)):
    print("It was right!")
else:
    print("Sorry, It was wrong!")

0 Running reward for the 5 arms of the bandit: [-1.  0.  0.  0.  0.]
200 Running reward for the 5 arms of the bandit: [-15.  -3.  35.  -8.  36.]
400 Running reward for the 5 arms of the bandit: [-11.  -1.  71. -11.  65.]
600 Running reward for the 5 arms of the bandit: [-13.   3. 119. -22.  94.]
800 Running reward for the 5 arms of the bandit: [-19.   4. 158. -38. 130.]
1000 Running reward for the 5 arms of the bandit: [-22.   1. 198. -45. 167.]
1200 Running reward for the 5 arms of the bandit: [-19.  10. 248. -46. 188.]
1400 Running reward for the 5 arms of the bandit: [-23.  17. 278. -44. 229.]
1600 Running reward for the 5 arms of the bandit: [-29.  15. 321. -56. 270.]
1800 Running reward for the 5 arms of the bandit: [-46.  12. 362. -60. 315.]
2000 Running reward for the 5 arms of the bandit: [-51.  18. 408. -62. 352.]
2200 Running reward for the 5 arms of the bandit: [-58.  16. 455. -76. 392.]
2400 Running reward for the 5 arms of the bandit: [-74.   9. 508. -85. 429.]
2600 Runnin