In [None]:
from multi_armed_bandit.bandit import *
from multi_armed_bandit.agent import *
from multi_armed_bandit.policy import *

In [None]:
from pathos.multiprocessing import ProcessingPool as Pool
MAX_STEPS = 10000  # default number of pulls performed by run_agent

def run_agent(agent, max_steps=None):
    """Call ``agent.pull()`` repeatedly and hand the same agent back.

    Parameters
    ----------
    agent : object
        Anything exposing a ``pull()`` method.
    max_steps : int, optional
        Number of pulls to perform. When omitted, the module-level
        ``MAX_STEPS`` is read at call time, so one-argument callers
        (e.g. ``pool.map(run_agent, agents)``) behave as before.

    Returns
    -------
    object
        The ``agent`` that was passed in, after the pulls.
    """
    if max_steps is None:
        max_steps = MAX_STEPS
    for _ in range(max_steps):
        agent.pull()
    return agent

In [None]:
import matplotlib.pyplot as plt
def _mean_running_average(result, pol, attr):
    """Average over experiments of ``attr / [1, 2, ..., step]`` for policy index ``pol``."""
    num_exp = len(result)
    # Curve length is taken from the first experiment; every experiment is
    # expected to have run the same number of steps.
    mean_curve = np.zeros(result[0][pol].step)
    for exp in range(num_exp):
        agent = result[exp][pol]
        # Dividing the per-step series by 1..step yields a running average
        # (the series is presumably cumulative, as its name suggests).
        mean_curve += getattr(agent, attr) / np.arange(1, agent.step + 1)
    return mean_curve / num_exp


def _plot_metric(result, attr, ylabel, title, ymin, ymax, legend_x, legend_y):
    """Draw one figure with a curve per policy for the per-step series ``attr``."""
    for pol in range(len(result[0])):
        plt.plot(_mean_running_average(result, pol, attr), label=result[0][pol].name)
    plt.xlabel('iterations')
    plt.ylabel(ylabel)
    plt.ylim(ymin, ymax)
    plt.legend(loc='upper right', bbox_to_anchor=(legend_x, legend_y))
    plt.title(title)
    plt.show()


def plot_result(result, ymax=2.0, ymin=0.0, legend_x=1.4, legend_y=1.0):
    """Plot average reward and optimal-action ratio for every policy.

    Two figures are shown, each with one curve per policy averaged over all
    experiments: the first uses ``reward_cumulated``, the second
    ``optimal_cumulated``.

    Parameters
    ----------
    result : sequence of sequences
        ``result[exp][pol]`` is the agent of policy ``pol`` in experiment
        ``exp``. Each agent must expose ``step``, ``name``,
        ``reward_cumulated`` and ``optimal_cumulated`` (the latter two of
        length ``step``).
    ymax, ymin : float
        y-axis limits of the reward figure (the ratio figure always uses 0..1).
    legend_x, legend_y : float
        ``bbox_to_anchor`` coordinates used for the legend of both figures.

    Raises
    ------
    ValueError
        If ``result`` contains no experiments.
    """
    if len(result) == 0:
        raise ValueError('result must contain at least one experiment')

    _plot_metric(result, 'reward_cumulated', 'average reward',
                 'Average Reward of Policies', ymin, ymax, legend_x, legend_y)
    # Plain '%' here: the former '\%' was an invalid escape sequence and put a
    # literal backslash into the axis label.
    _plot_metric(result, 'optimal_cumulated', '% of optimal action',
                 '% of Optimal Action of Policies', 0.0, 1.0, legend_x, legend_y)

In [None]:
#1. Comparison of Various Policies
# Every policy gets its own Agent, all attached to the same GaussianBandit(k=10).
from tqdm import tqdm

NUM_EXPERIMENTS = 25  # repeated runs collected into `result` (one row per run)

bandit = GaussianBandit(k=10)
policies = [
    Greedy_Policy(),
    e_Greedy_Policy(0.1), e_Greedy_Policy(0.05), e_Greedy_Policy(0.01),
    Random_Policy(),
    UCB_Policy(1.0), UCB_Policy(2.0),
]
agents = [Agent(bandit.k, policy, bandit=bandit) for policy in policies]

# One worker per agent so all policies are stepped in parallel within a run.
p = Pool(len(agents))
result = []
try:
    for run_index in tqdm(range(NUM_EXPERIMENTS)):
        for agent in agents:
            agent.reset()
        # One list of finished agents (in `agents` order) per run.
        result.append(p.map(run_agent, agents))
finally:
    # Shut the worker processes down and drop the pool from pathos' cache so
    # that workers do not stay alive after the cell finishes.
    p.close()
    p.join()
    p.clear()

In [None]:
# Draw both figures for the runs currently stored in `result`,
# with the reward axis capped at 2.0.
plot_result(result, ymax=2.0)

In [None]:
#2. Comparison of Initialization of Greedy
# Four greedy agents that differ only in the `init_value` passed to Agent.
from tqdm import tqdm

NUM_EXPERIMENTS = 25  # repeated runs collected into `result` (one row per run)

bandit = GaussianBandit(k=10)
init_values = [0.0, 2.5, 5.0, 10.0]
# A separate policy object per agent, as in a hand-written list of Greedy_Policy().
policies = [Greedy_Policy() for _ in init_values]
agents = [Agent(bandit.k, policy, init_value=init_value, bandit=bandit)
          for policy, init_value in zip(policies, init_values)]

# One worker per agent so all agents are stepped in parallel within a run.
p = Pool(len(agents))
result = []
try:
    for run_index in tqdm(range(NUM_EXPERIMENTS)):
        for agent in agents:
            # Reset with the agent's own initialisation settings so each run
            # starts from the configured init_value.
            agent.reset(init_type=agent.init_type, value=agent.init_value, std=agent.init_std)
        # One list of finished agents (in `agents` order) per run.
        result.append(p.map(run_agent, agents))
finally:
    # Shut the worker processes down and drop the pool from pathos' cache so
    # that workers do not stay alive after the cell finishes.
    p.close()
    p.join()
    p.clear()

In [None]:
# Draw both figures for the runs currently stored in `result`,
# with the reward axis capped at 2.0.
plot_result(result, ymax=2.0)

In [None]:
from multi_armed_bandit.gradient_agent import *

In [None]:
#3. Comparison of GradientBanditAgent
# NOTE: this cell extends the `agents` list and reuses the `bandit` left by
# the previous experiment cell, so that cell must have been run first.
from tqdm import tqdm

NUM_EXPERIMENTS = 25  # repeated runs collected into `result` (one row per run)
gammas = [0.05, 0.1, 0.4]

# Drop gradient agents appended by an earlier execution of this cell; without
# this, every re-run would add six more agents to the comparison.
agents = [agent for agent in agents if not isinstance(agent, GradientBanditAgent)]

# Gradient agents with the constructor's default baseline_type.
policies = [Greedy_Policy() for _ in gammas]
agents += [GradientBanditAgent(bandit.k, policy, gamma=gamma, bandit=bandit)
           for policy, gamma in zip(policies, gammas)]

# Same gammas again, this time with baseline_type=None.
policies = [Greedy_Policy() for _ in gammas]
agents += [GradientBanditAgent(bandit.k, policy, gamma=gamma, baseline_type=None, bandit=bandit)
           for policy, gamma in zip(policies, gammas)]

# One worker per agent so all agents are stepped in parallel within a run.
p = Pool(len(agents))
result = []
try:
    for run_index in tqdm(range(NUM_EXPERIMENTS)):
        for agent in agents:
            # Reset with the agent's own initialisation settings.
            agent.reset(init_type=agent.init_type, value=agent.init_value, std=agent.init_std)
        # One list of finished agents (in `agents` order) per run.
        result.append(p.map(run_agent, agents))
finally:
    # Shut the worker processes down and drop the pool from pathos' cache so
    # that workers do not stay alive after the cell finishes.
    p.close()
    p.join()
    p.clear()


In [None]:
# Draw both figures for the runs currently stored in `result`: reward axis
# capped at 1.5 and the legend anchored further right (x=1.8).
plot_result(result, ymax=1.5, legend_x=1.8, legend_y=1.0)