## Multi-armed Bandit Problem: epsilon-greedy

#### - Constraints
    
    1) 각 arm은 각기 다른 reward를 제공      
    2) 제한된 시간 내에 제한된 횟수만큼 arm을 이용      
    3) 한 번에 하나의 arm을 당길 수 있음
    
#### - Objective

    : 정해진 시간 내에 총 reward를 maximize하는 policy를 찾는 것

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def pullBandit(bandit):
    """Pull one arm and return its reward.

    A single sample is drawn from the standard normal distribution and
    compared against the arm's threshold.

    Args:
        bandit: Threshold of the arm being pulled. The lower the value,
            the more likely the draw exceeds it.

    Returns:
        1 if the draw is strictly greater than ``bandit``, otherwise -1.
    """
    draw = np.random.randn(1)
    return 1 if draw > bandit else -1

In [3]:
# Threshold of each arm. pullBandit pays +1 when a standard-normal draw
# exceeds the arm's threshold, so a LOWER value means a better arm; the
# last arm (-5) therefore pays +1 on almost every pull.
bandits = [0.2, 0, -0.2, -5]
# Number of arms available to the agent.
num_bandits = len(bandits)

### ***The agent***

In [5]:
# Start from an empty graph so re-running this cell does not stack duplicate ops.
tf.reset_default_graph()

# Inputs fed on every training step: the reward just observed and the
# index of the arm that produced it (each a length-1 vector).
reward_holder = tf.placeholder(tf.float32, shape=[1])
action_holder = tf.placeholder(tf.int32, shape=[1])

# One trainable weight per arm, all starting at 1; the greedy choice is
# the arm whose weight is currently the largest.
weights = tf.Variable(initial_value=tf.ones(shape=[num_bandits]))
chosen_action = tf.argmax(weights, 0)

# Pick out the single weight belonging to the arm that was pulled.
responsible_weight = tf.slice(weights, begin=action_holder, size=[1])

# Policy-gradient style loss: minimising -log(w) * reward raises the
# weight after a positive reward and lowers it after a negative one.
loss = -(tf.log(responsible_weight) * reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
update = optimizer.minimize(loss)

### ***Training***

In [6]:
EPOCHS = 1000                            # number of arm pulls
e = 0.1                                  # epsilon: probability of exploring
total_reward = np.zeros(num_bandits)     # running reward per arm

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for i in range(EPOCHS):
        # Epsilon-greedy: explore a random arm with probability e,
        # otherwise exploit the arm with the largest weight.
        explore = np.random.rand(1) < e
        action = np.random.randint(num_bandits) if explore else sess.run(chosen_action)

        reward = pullBandit(bandits[action])

        # One gradient step on the pulled arm's weight; W keeps the latest
        # weight vector so it can be inspected after the session closes.
        feed = {reward_holder: [reward], action_holder: [action]}
        _, resp, W = sess.run([update, responsible_weight, weights], feed_dict=feed)
        total_reward[action] += reward

        # Report progress every 50 pulls.
        if i % 50 == 0:
            print("Running reward for the ", num_bandits, " bandits: ", total_reward)

Running reward for the  4  bandits:  [-1.  0.  0.  0.]
Running reward for the  4  bandits:  [-1.  1. -1. 42.]
Running reward for the  4  bandits:  [ 0.  0. -2. 89.]
Running reward for the  4  bandits:  [ -2.   1.  -3. 135.]
Running reward for the  4  bandits:  [ -2.   1.  -5. 181.]
Running reward for the  4  bandits:  [ -1.  -1.  -5. 224.]
Running reward for the  4  bandits:  [ -2.  -1.  -5. 269.]
Running reward for the  4  bandits:  [ -2.  -2.  -6. 313.]
Running reward for the  4  bandits:  [ -1.  -5.  -5. 358.]
Running reward for the  4  bandits:  [ -1.  -4.  -5. 407.]
Running reward for the  4  bandits:  [ -3.  -3.  -5. 454.]
Running reward for the  4  bandits:  [ -3.  -3.  -4. 503.]
Running reward for the  4  bandits:  [ -2.  -4.  -3. 550.]
Running reward for the  4  bandits:  [ -1.  -4.  -2. 596.]
Running reward for the  4  bandits:  [ -1.  -6.  -1. 639.]
Running reward for the  4  bandits:  [ -1.  -6.   0. 688.]
Running reward for the  4  bandits:  [ -1.  -6.   0. 736.]
Running r

### ***Prediction***

In [7]:
# The best arm is the one with the lowest threshold, i.e. the argmax of
# the negated thresholds; the agent is right if its largest learned
# weight points at that same arm.
best_arm = np.argmax(-np.array(bandits))
learned_arm = np.argmax(W)
print("Right" if learned_arm == best_arm else "Wrong")

Right
