# Multi-Armed Bandit

In [2]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

## Bandit 정의

In [6]:
# Threshold attached to each arm of the bandit.
bandit_arms = [0.2, 0, -0.2, -2]
num_arms = len(bandit_arms)


def pullBandit(bandit):
    """Pull one arm and return the reward it pays out.

    A single standard-normal sample is drawn and compared against the arm's
    threshold, so the lower the threshold, the more likely a positive reward.
    With the arms defined above, index 3 (threshold -2) pays out most often.

    Args:
        bandit: Threshold of the arm being pulled.

    Returns:
        1 when the sample exceeds the threshold, otherwise -1.
    """
    sample = np.random.randn(1)[0]
    return 1 if sample > bandit else -1

우리가 원하는 것은 에이전트가 + 보상을 받을 확률이 가장 높은 손잡이를 선택하도록 학습하는 것이다.

## 에이전트
간단한 신경망 구현. 

$Loss = -\log(\pi) \cdot A$

여기서 $\pi$는 선택한 액션에 대한 정책(소프트맥스 출력) 확률이고, $A$는 어드밴티지(이 예제에서는 받은 보상)이다.

In [7]:
# Clear the default graph before building the network.
tf.reset_default_graph()

# Feed-forward part: one trainable weight per arm, turned into a
# probability distribution over the arms by a softmax.
weights = tf.Variable(tf.ones(shape=[num_arms]))
output = tf.nn.softmax(weights)

# Training part: the reward and the chosen action are fed into the network
# to compute the loss, and the loss is used to update the weights.
reward_holder = tf.placeholder(dtype=tf.float32, shape=[1])
action_holder = tf.placeholder(dtype=tf.int32, shape=[1])

# Softmax output for the chosen action only.
responsible_output = tf.slice(output, begin=action_holder, size=[1])
# Policy-gradient loss: -log(pi) * reward.
loss = -(tf.log(responsible_output) * reward_holder)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
update = optimizer.minimize(loss)

## 에이전트 학습

In [9]:
total_episodes = 1000  # Total number of episodes to train the agent on.

# Running sum of the rewards collected from each arm, starting at 0.
total_reward = np.zeros(num_arms)

init = tf.global_variables_initializer()

# Launch the TensorFlow graph.
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):

        # Choose an action from the softmax (Boltzmann) policy by sampling
        # the arm *index* directly. Sampling a probability value and looking
        # its index up with `actions == a` always returns the first matching
        # index, so arms that share a probability (all of them while the
        # weights are still equal) would never be explored fairly.
        actions = sess.run(output)
        action = np.random.choice(num_arms, p=actions)

        # Pull the chosen arm and observe the reward (1 or -1).
        reward = pullBandit(bandit_arms[action])

        # Update the network with the observed reward for the chosen action.
        _, resp, ww = sess.run(
            [update, responsible_output, weights],
            feed_dict={reward_holder: [reward], action_holder: [action]},
        )

        # Update the per-arm scoreboard and report progress every 50 episodes.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Running reward for the " + str(num_arms) + " arms of the bandit: " + str(total_reward))

# The best arm is the one with the lowest threshold, i.e. the largest negated value.
print("\nThe agent thinks arm " + str(np.argmax(ww)+1) + " is the most promising....")
if np.argmax(ww) == np.argmax(-np.array(bandit_arms)):
    print("...and it was right!")
else:
    print("...and it was wrong!")

Running reward for the 4 arms of the bandit: [1. 0. 0. 0.]
Running reward for the 4 arms of the bandit: [-2. -4.  0. 11.]
Running reward for the 4 arms of the bandit: [-2.  0.  5. 24.]
Running reward for the 4 arms of the bandit: [-3.  1.  3. 36.]
Running reward for the 4 arms of the bandit: [-3.  4.  3. 47.]
Running reward for the 4 arms of the bandit: [-6. 10. -2. 59.]
Running reward for the 4 arms of the bandit: [-9. 15.  5. 68.]
Running reward for the 4 arms of the bandit: [-7. 16.  4. 84.]
Running reward for the 4 arms of the bandit: [-13.  12.   6.  98.]
Running reward for the 4 arms of the bandit: [-14.  15.   4. 106.]
Running reward for the 4 arms of the bandit: [-17.  15.   5. 124.]
Running reward for the 4 arms of the bandit: [-15.  22.   7. 137.]
Running reward for the 4 arms of the bandit: [ -9.  16.  11. 153.]
Running reward for the 4 arms of the bandit: [ -7.  15.  15. 174.]
Running reward for the 4 arms of the bandit: [-11.  16.  22. 186.]
Running reward for the 4 arms o

In [11]:
# Show the weights fetched during the last training step (one per arm).
ww

array([0.78322226, 0.844812  , 0.9155914 , 1.4349186 ], dtype=float32)

In [15]:
# Negated arm thresholds: the largest entry marks the arm most likely to pay +1.
-np.array(bandit_arms)

array([-0.2, -0. ,  0.2,  2. ])