In [6]:
import gym_bandits
import gym
import numpy as np
import math
import random

In [2]:
# Create the single bandit environment that every strategy cell below steps
# through (the id suggests ten arms with Gaussian rewards).
env = gym.make("BanditTenArmedGaussian-v0")

In [3]:
# Display the action space; the strategies below index arms by these actions.
env.action_space

Discrete(10)

## 1. Epsilon-greedy Policy

In [4]:
def epsilon_greedy(epsilon, q_values=None):
    """Select an arm with the epsilon-greedy rule.

    With probability ``epsilon`` a random arm is sampled from
    ``env.action_space`` (exploration); otherwise the arm with the highest
    estimated value is returned (exploitation).

    Args:
        epsilon: Exploration probability. A value <= 0 always exploits and a
            value >= 1 always explores, because the random draw lies in
            [0, 1).
        q_values: Optional sequence of per-arm value estimates. When omitted,
            the notebook-level ``Q`` array is used, which keeps existing
            ``epsilon_greedy(eps)`` calls working unchanged.

    Returns:
        Index of the selected arm. On exploitation, ties are resolved in
        favour of the lowest index (``np.argmax`` semantics).
    """
    if q_values is None:
        # Fall back to the shared notebook state; looked up at call time so a
        # re-initialised Q is picked up automatically.
        q_values = Q

    if np.random.random() < epsilon:
        # Exploration: run one of the available actions at random.
        return env.action_space.sample()

    # Exploitation: take the arm that currently looks best.
    return np.argmax(q_values)

### Initialize all the necessary variables

In [8]:
# Total number of arm pulls (iterations) to simulate
num_rounds = 20000

# Per-arm running statistics, each its own length-10 vector of zeros:
#   count       - number of times the arm has been pulled
#   sum_rewards - cumulative reward collected from the arm
#   Q           - estimated value of the arm (its average reward)
count, sum_rewards, Q = (np.zeros(10) for _ in range(3))

### Start pulling the arm!

In [12]:
for i in range(num_rounds):
    # Choose an arm: explore and exploit with equal (50/50) probability
    arm = epsilon_greedy(0.5)

    # Pull the chosen arm; only the reward is used by this loop
    observation, reward, done, info = env.step(arm)

    # Fold the new reward into this arm's running statistics:
    # pull count, cumulative reward, and their ratio (the average reward)
    count[arm] += 1
    sum_rewards[arm] += reward
    Q[arm] = sum_rewards[arm] / count[arm]

# Report the arm with the highest estimated average reward
print('The optimal arm is {}'.format(Q.argmax()))

The optimal arm is 0


* 어떤 strategy를 사용하든 (충분히 많은 round를 수행하면) 같은 optimal arm이 나와야 함! 모든 strategy가 같은 environment, 즉 같은 reward 분포를 사용하기 때문.

## 2. Softmax Exploration Algorithm

In [13]:
def softmax(tau, q_values=None):
    """Sample an arm from the softmax (Boltzmann) distribution over values.

    Arm ``i`` is chosen with probability proportional to
    ``exp(q_values[i] / tau)``.

    Args:
        tau: Temperature. Small positive values make the choice nearly
            greedy; large values make it nearly uniform.
        q_values: Optional sequence of per-arm value estimates. When omitted,
            the notebook-level ``Q`` array is used, which keeps existing
            ``softmax(tau)`` calls working unchanged.

    Returns:
        Index of the sampled arm.
    """
    if q_values is None:
        # Fall back to the shared notebook state (looked up at call time).
        q_values = Q

    # Shift the scaled values by their maximum before exponentiating. The
    # shift cancels in the normalisation, so the probabilities are the same,
    # but the largest exponent becomes exp(0) == 1 and math.exp can no longer
    # raise OverflowError for large value/temperature ratios.
    scaled = [val / tau for val in q_values]
    peak = max(scaled)
    weights = [math.exp(s - peak) for s in scaled]
    total = sum(weights)
    probs = [w / total for w in weights]

    # Inverse-CDF sampling: walk the cumulative distribution until it passes
    # a uniform random threshold.
    threshold = random.random()
    cumulative_prob = 0.0
    for arm, prob in enumerate(probs):
        cumulative_prob += prob
        if cumulative_prob > threshold:
            return arm

    # Rounding can leave the cumulative sum marginally below the threshold;
    # fall back to the most probable arm in that case.
    return np.argmax(probs)

### Initialize all the necessary variables

In [14]:
# Total number of arm pulls (iterations) to simulate
num_rounds = 20000

# Reset the per-arm statistics for this strategy.
# Number of times each arm has been pulled
count = np.zeros(10)
# Cumulative reward collected from each arm (same shape as count)
sum_rewards = np.zeros_like(count)
# Estimated value of each arm, i.e. its average reward (same shape as count)
Q = np.zeros_like(count)

### Start pulling the arm!

In [15]:
for i in range(num_rounds):
    # Sample an arm from the softmax distribution (temperature 0.5)
    arm = softmax(0.5)

    # Pull the chosen arm; only the reward is used by this loop
    observation, reward, done, info = env.step(arm)

    # Fold the new reward into this arm's running statistics:
    # pull count, cumulative reward, and their ratio (the average reward)
    count[arm] += 1
    sum_rewards[arm] += reward
    Q[arm] = sum_rewards[arm] / count[arm]

# Report the arm with the highest estimated average reward
print('The optimal arm is {}'.format(Q.argmax()))

The optimal arm is 0


## 3. Upper Confidence Bound Algorithm

In [16]:
def UCB(iters, counts=None, q_values=None):
    """Select an arm with the UCB1 (upper confidence bound) rule.

    During the first ``n_arms`` iterations every arm is pulled once, in
    order. Afterwards the arm maximising
    ``q_values[arm] + sqrt(2 * ln(total_pulls) / counts[arm])`` is returned.

    Args:
        iters: Zero-based index of the current round.
        counts: Optional per-arm pull counts. Defaults to the notebook-level
            ``count`` array.
        q_values: Optional per-arm value estimates. Defaults to the
            notebook-level ``Q`` array.

    Returns:
        Index of the selected arm.
    """
    if counts is None:
        # Fall back to the shared notebook state (looked up at call time).
        counts = count
    if q_values is None:
        q_values = Q

    counts = np.asarray(counts, dtype=float)
    q_values = np.asarray(q_values, dtype=float)
    # Derive the number of arms from the data instead of hard-coding 10.
    n_arms = len(counts)

    # Initial sweep: try every arm once so each count is non-zero.
    if iters < n_arms:
        return iters

    # An arm that has never been pulled has an unbounded confidence interval.
    # Pick it directly rather than dividing by zero (or taking log(0) when
    # nothing has been pulled at all).
    untried = np.flatnonzero(counts == 0)
    if untried.size:
        return untried[0]

    # The total pull count is the same for every arm, so compute it once.
    total_pulls = counts.sum()
    exploration_bonus = np.sqrt(2 * np.log(total_pulls) / counts)

    # Return the arm with the largest upper confidence bound.
    return np.argmax(q_values + exploration_bonus)

### Initialize all the necessary variables

In [17]:
# Total number of arm pulls (iterations) to simulate
num_rounds = 20000

# Reset the per-arm statistics for this strategy (three independent
# length-10 zero vectors):
#   count       - number of times the arm has been pulled
#   sum_rewards - cumulative reward collected from the arm
#   Q           - estimated value of the arm (its average reward)
count, sum_rewards, Q = np.zeros(10), np.zeros(10), np.zeros(10)

### Start pulling the arm!

In [18]:
for i in range(num_rounds):
    # Choose an arm by upper confidence bound; the round index drives the
    # initial one-pull-per-arm sweep
    arm = UCB(i)

    # Pull the chosen arm; only the reward is used by this loop
    observation, reward, done, info = env.step(arm)

    # Fold the new reward into this arm's running statistics:
    # pull count, cumulative reward, and their ratio (the average reward)
    count[arm] += 1
    sum_rewards[arm] += reward
    Q[arm] = sum_rewards[arm] / count[arm]

# Report the arm with the highest estimated average reward
print('The optimal arm is {}'.format(Q.argmax()))

The optimal arm is 0


## 4. Thompson Sampling Algorithm

In [19]:
def thompson_sampling(alpha, beta):
    """Select an arm by Thompson sampling from per-arm Beta distributions.

    One value is drawn from ``Beta(alpha[i], beta[i])`` for every arm and the
    arm with the largest draw is returned.

    Args:
        alpha: Sequence of per-arm Beta ``a`` parameters (each > 0).
        beta: Sequence of per-arm Beta ``b`` parameters (each > 0), the same
            length as ``alpha``.

    Returns:
        Index of the arm whose sampled value is largest.

    Raises:
        ValueError: If ``alpha`` and ``beta`` differ in length.
    """
    if len(alpha) != len(beta):
        raise ValueError('alpha and beta must have the same length')

    # Iterate over the parameters that were passed in rather than a
    # hard-coded range(10), so any number of arms works. The Beta sampler is
    # provided by NumPy; draws are made arm by arm, in order.
    samples = [np.random.beta(a, b) for a, b in zip(alpha, beta)]
    return np.argmax(samples)

### Initialize all the necessary variables

In [21]:
# Total number of arm pulls (iterations) to simulate
num_rounds = 20000

# Reset the per-arm statistics for this strategy (three independent
# length-10 zero vectors):
#   count       - number of times the arm has been pulled
#   sum_rewards - cumulative reward collected from the arm
#   Q           - estimated value of the arm (its average reward)
count, sum_rewards, Q = (np.zeros(10) for _ in range(3))

# Per-arm Beta distribution parameters, both starting at 1 for every arm
alpha, beta = np.ones(10), np.ones(10)

### Start pulling the arm!

In [22]:
for i in range(num_rounds):
    # Choose an arm by sampling from the current per-arm Beta distributions
    arm = thompson_sampling(alpha, beta)

    # Pull the chosen arm; only the reward is used by this loop
    observation, reward, done, info = env.step(arm)

    # Fold the new reward into this arm's running statistics:
    # pull count, cumulative reward, and their ratio (the average reward)
    count[arm] += 1
    sum_rewards[arm] += reward
    Q[arm] = sum_rewards[arm] / count[arm]

    # Update the arm's Beta parameters: a strictly positive reward increments
    # alpha, anything else (zero or negative) increments beta
    if reward > 0:
        alpha[arm] += 1
    else:
        beta[arm] += 1

# Report the arm with the highest estimated average reward
print('The optimal arm is {}'.format(Q.argmax()))

The optimal arm is 0
