# Contextual Bandit

In [3]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

## 콘텍스트 밴딧
밴딧마다 최적의 결과를 내는 action이 서로 다르므로, 에이전트는 현재 밴딧(상태)에 맞는 action을 선택해야 함

In [4]:
class contextual_bandit():
    """Toy contextual-bandit environment with three bandits of four arms each.

    Each entry of ``bandits`` is a threshold: pulling an arm draws one
    standard-normal sample and pays +1 when the sample exceeds the threshold,
    -1 otherwise. Lower thresholds therefore pay out more often.
    """

    def __init__(self):
        # Arm thresholds, one row per bandit. The lowest value in each row is
        # the best arm: arms 4, 2 and 1 (1-based) respectively.
        self.bandits = np.array([[0.2,0,-0.0,-5],
                                 [0.1,-5,1,0.25],
                                 [-5,5,5,5]])
        self.num_bandits, self.num_actions = self.bandits.shape

        # Index of the bandit currently being played.
        self.state = 0

    def getBandit(self):
        """Pick a bandit uniformly at random, store it as the state, return it."""
        self.state = np.random.randint(0, self.bandits.shape[0])
        return self.state

    def pullArm(self,action):
        """Pull arm ``action`` of the current bandit and return +1 or -1."""
        threshold = self.bandits[self.state, action]
        # One standard-normal draw decides the outcome.
        draw = np.random.randn(1)
        # Positive reward when the draw beats the threshold, negative otherwise.
        return 1 if draw > threshold else -1

## 정책 기반 에이전트

In [6]:
class agent():
    """Policy agent built from a single bias-free fully connected layer.

    Args:
        lr: learning rate handed to the gradient-descent optimizer.
        s_size: number of states; used as the one-hot encoding depth.
        a_size: number of actions; used as the layer's output width.
    """

    def __init__(self, lr, s_size, a_size):
        # Feed-forward part: the agent takes a state index and produces one
        # sigmoid-activated score per action.
        self.state_in = tf.placeholder(tf.int32, shape=[1])
        one_hot_state = slim.one_hot_encoding(self.state_in, s_size)
        arm_scores = slim.fully_connected(
            one_hot_state,
            a_size,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=tf.ones_initializer(),
            biases_initializer=None,
        )
        # Flatten to a 1-D vector and take the highest-scoring action.
        self.output = tf.reshape(arm_scores, [-1])
        self.chosen_action = tf.argmax(self.output, 0)

        # Training part: the reward and the action that was taken are fed in,
        # the loss is computed from them, and the network is updated with it.
        self.reward_holder = tf.placeholder(tf.float32, shape=[1])
        self.action_holder = tf.placeholder(tf.int32, shape=[1])

        # Score of the action that was actually taken.
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        log_weight = tf.log(self.responsible_weight)
        self.loss = -(log_weight * self.reward_holder)

        sgd = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = sgd.minimize(self.loss)

## 에이전트 학습
에이전트는 환경의 상태를 알아내고, 액션을 취하고, 보상을 받음으로써 학습함


In [9]:
tf.reset_default_graph()  # Start from an empty TensorFlow graph.

# Build the environment, then an agent sized to match it.
cBandit = contextual_bandit()
myAgent = agent(lr=0.001,
                s_size=cBandit.num_bandits,
                a_size=cBandit.num_actions)

# First trainable variable in the graph; fetched on every step so the
# network's weights can be inspected afterwards.
weights = tf.trainable_variables()[0]

# Number of episodes to train for.
total_episodes = 10000

# Scoreboard: cumulative reward per (bandit, action) pair, starting at zero.
total_reward = np.zeros([cBandit.num_bandits, cBandit.num_actions])

# Probability of taking a random action instead of the network's choice.
e = 0.1

init = tf.global_variables_initializer()

# Run the training loop inside a TensorFlow session.
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        # Draw a random bandit; its index is the state.
        s = cBandit.getBandit()

        # Explore with probability e, otherwise ask the network for an action.
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action,
                              feed_dict={myAgent.state_in: [s]})

        # Reward for taking this action on the current bandit.
        reward = cBandit.pullArm(action)

        # Apply one update step and fetch the current weights.
        feed_dict = {
            myAgent.reward_holder: [reward],
            myAgent.action_holder: [action],
            myAgent.state_in: [s],
        }
        _, ww = sess.run([myAgent.update, weights], feed_dict=feed_dict)

        # Record the reward on the scoreboard.
        total_reward[s, action] += reward

        # Progress report every 500 episodes.
        if i % 500 == 0:
            print("Mean reward for each of the {} bandits: {}".format(
                cBandit.num_bandits, np.mean(total_reward, axis=1)))

# Compare the highest-weight action for each bandit with the arm that has the
# lowest threshold in the environment's table.
for a in range(cBandit.num_bandits):
    guess = np.argmax(ww[a])
    print("The agent thinks action {} for bandit {} is the most promising....".format(
        guess + 1, a + 1))
    verdict = ("...and it was right!"
               if guess == np.argmin(cBandit.bandits[a])
               else "...and it was wrong!")
    print(verdict)

Mean reward for each of the 3 bandits: [-0.25  0.    0.  ]
Mean reward for each of the 3 bandits: [33.75  9.5  30.5 ]
Mean reward for each of the 3 bandits: [73.25 51.   68.  ]
Mean reward for each of the 3 bandits: [108.75  91.5  104.  ]
Mean reward for each of the 3 bandits: [144.   131.25 142.  ]
Mean reward for each of the 3 bandits: [180.5  171.25 175.5 ]
Mean reward for each of the 3 bandits: [221.5  206.75 206.  ]
Mean reward for each of the 3 bandits: [260.25 242.75 239.75]
Mean reward for each of the 3 bandits: [300.5  279.5  276.75]
Mean reward for each of the 3 bandits: [338.25 314.   315.  ]
Mean reward for each of the 3 bandits: [374.5  356.   347.25]
Mean reward for each of the 3 bandits: [410.5  392.25 383.  ]
Mean reward for each of the 3 bandits: [450.25 429.   416.5 ]
Mean reward for each of the 3 bandits: [490.   464.   451.25]
Mean reward for each of the 3 bandits: [524.5  503.5  482.75]
Mean reward for each of the 3 bandits: [562.   541.   519.75]
Mean reward for e

In [8]:
# Display the weight values fetched on the last training step (indexed by bandit, then action).
ww

array([[0.99946433, 0.99731004, 0.9994643 , 1.6395718 ],
       [0.9978488 , 1.6376197 , 0.9826807 , 0.9967714 ],
       [1.6385962 , 0.9780429 , 0.974486  , 0.97146916]], dtype=float32)

In [10]:
# Display the cumulative reward collected for each (bandit, action) pair.
total_reward

array([[ -26.,    0.,  -17., 3046.],
       [ -22., 2976.,  -58.,   -9.],
       [3042.,  -90.,  -93.,  -93.]])