## Contextual Bandit

#### - constraints

    1) 각 arm은 각기 다른 reward를 제공  
    2) 제한된 시간 내에 제한된 횟수만큼 arm을 이용      
    3) 한 번에 하나의 arm을 당길 수 있음      
    4) bandit은 env.으로부터 random하게 주어짐
    
#### - Objective

    : 정해진 시간 내에 bandit마다 총 reward를 maximize하는 policy를 찾는 것

In [2]:
import tensorflow as tf
import numpy as np
import tensorflow.contrib.slim as slim

tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
class contextual_bandit() :
    def __init__(self) :
        self.state = 0
        self.bandits = np.array([
            [0.2, 0, 0, -5],
            [0.1, -5, 1, 0.25],
            [-5, 5, 5, 5]
        ])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self) :
        self.state = np.random.randint(0, len(self.bandits))
        
        return self.state
    
    def pullArm(self, action) :
        bandit = self.bandits[self.state, action]
        result = np.random.randn(1)
        
        if result < bandit :
            return 1
        else :
            return -1

### ***The policy-based agents***

##### the agent do -->
    1) Observe the env. (Get the current state)      
    2) Take an action by policy      
    3) update the weight to get optimal

In [8]:
class agent() :
    def __init__(self, lr, s_size, a_size) :
        self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
        state_in_OH = slim.one_hot_encoding(self.state_in, s_size)
    
        output = slim.fully_connected(state_in_OH, a_size, biases_initializer=None, activation_fn=tf.nn.sigmoid, weights_initializer=tf.ones_initializer())
        
        self.output = tf.reshape(output, [-1])
        self.chosen_action = tf.argmax(self.output, 0)
        
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        self.loss = -(tf.log(self.responsible_weight)*self.reward_holder)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

### ***Training***

    1) By getting a state from the environment  
    2) Take an action  
    3) Receive a reward

In [9]:
tf.reset_default_graph()

cBandit = contextual_bandit()
myAgent = agent(lr=1e-3, s_size=cBandit.num_bandits, a_size=cBandit.num_actions)
weights = tf.trainable_variables()[0]

EPOCHS = 10000
rAll = np.zeros([cBandit.num_bandits, cBandit.num_actions])
e = 0.1

In [12]:
init = tf.global_variables_initializer()

with tf.Session() as sess :
    sess.run(init)
    
    for i in range(EPOCHS) :
        s = cBandit.getBandit()
        
        if np.random.rand(1) < e :
            action = np.random.randint(cBandit.num_actions)
        else :
            action = sess.run(myAgent.chosen_action, feed_dict={myAgent.state_in: [s]})
            
        reward = cBandit.pullArm(action)
        
        feed_dict = {myAgent.reward_holder: [reward],
                    myAgent.action_holder: [action],
                    myAgent.state_in: [s]}
        
        _, W = sess.run([myAgent.update, weights], feed_dict=feed_dict)
        
        rAll[s, action] += reward
        
        if i%500 == 0 :
            print("Mean reward for each of the ", cBandit.num_bandits, " bandits: ", np.mean(rAll, axis=1))

Mean reward for each of the  3  bandits:  [-0.25  0.    0.  ]
Mean reward for each of the  3  bandits:  [ 0.   21.25 41.  ]
Mean reward for each of the  3  bandits:  [ 2.5  48.75 75.5 ]
Mean reward for each of the  3  bandits:  [  5.    72.   116.75]
Mean reward for each of the  3  bandits:  [ 10.    99.25 158.  ]
Mean reward for each of the  3  bandits:  [ 13.25 122.   198.5 ]
Mean reward for each of the  3  bandits:  [ 21.25 147.   237.  ]
Mean reward for each of the  3  bandits:  [ 25.75 175.5  271.5 ]
Mean reward for each of the  3  bandits:  [ 38.   199.5  309.75]
Mean reward for each of the  3  bandits:  [ 44.75 225.25 347.75]
Mean reward for each of the  3  bandits:  [ 51.75 252.   386.  ]
Mean reward for each of the  3  bandits:  [ 56.5  273.   425.75]
Mean reward for each of the  3  bandits:  [ 66.25 298.5  464.  ]
Mean reward for each of the  3  bandits:  [ 78.   324.   503.75]
Mean reward for each of the  3  bandits:  [ 82.75 352.   545.  ]
Mean reward for each of the  3  ba

### ***Prediction***

In [13]:
for a in range(cBandit.num_bandits) :
    if np.argmax(W[a]) == np.argmin(cBandit.bandits[a]) :
        print("Right")
    else :
        print("Wrong")

Wrong
Wrong
Wrong
