In [None]:
using Random
using Distributions
using Plots

In [None]:
# Optional deterministic seeding: set USE_RANDOM_SEED to true to make every
# random draw below repeatable from run to run.
USE_RANDOM_SEED = false
RANDOM_SEED_VALUE = 421
USE_RANDOM_SEED ? Random.seed!(RANDOM_SEED_VALUE) : nothing

In [None]:
# Problem setup
K = 4     # Number of arms
T = 1000  # Number of timesteps

# Ranges from which each arm's mean and standard deviation are drawn
AVG_MIN, AVG_MAX = 0, 1
STD_MIN, STD_MAX = 0.01, 0.1

# Truncation bounds applied to every arm's reward distribution
LOW = 0
UP = 1

In [None]:
# Hyperparameters of the epsilon-greedy policy
epsilon = 0.1 # Probability of exploring (pulling a uniformly random arm) instead of exploiting
epsilon_decay = 0.99 # Multiplicative factor applied to the exploration rate after each timestep

In [None]:
# Build the K arms. Each arm is a normal distribution truncated to [LOW, UP]
# whose mean and standard deviation are sampled uniformly from the configured
# ranges. The density of every arm is drawn on a shared figure.
arm_avg = zeros(K)
arm_std = zeros(K)
arm_expectation = zeros(K)

p = plot()
x = LOW:1e-2:UP  # Evaluation grid covering the whole truncation interval

for k in 1:K
    # Define distributions (mean first, then spread, so seeded runs keep the same draws)
    arm_avg[k] = rand(Uniform(AVG_MIN, AVG_MAX))
    arm_std[k] = rand(Uniform(STD_MIN, STD_MAX))
    d = Truncated(Normal(arm_avg[k], arm_std[k]), LOW, UP)

    # Plot distributions on the explicit figure handle rather than the implicit current plot
    plot!(p, x, pdf.(d, x))

    arm_expectation[k] = mean(d)
    println(cdf(d, UP)) # Sanity check: the CDF at the upper truncation bound must be one
    println(arm_expectation[k]) # Expected value of the truncated distribution
end

# Best achievable expected reward per pull. Computed from the filled vector so
# the result keeps the element type of the expectations and does not rely on
# reassigning a global from inside the loop.
max_expected_value = maximum(arm_expectation)

display(p)
println(max_expected_value)

In [None]:
# Epsilon-greedy simulation over T timesteps, run side by side with a policy
# that pulls arms uniformly at random (used as a baseline).

# Reward distribution of every arm; built once because it never changes during the run
arm_dist = [Truncated(Normal(arm_avg[k], arm_std[k]), LOW, UP) for k in 1:K]

Q = zeros(K)                         # Current estimate of each arm's mean reward
rewards = zeros(T)                   # Reward obtained at each timestep
rewards_cumulated = zeros(T)         # Running sum of `rewards`
regret = zeros(T)                    # Running sum of (best expectation - pulled arm's expectation)
pulled_arm = zeros(T)                # Index of the arm pulled at each timestep
random_rewards_cumulated = zeros(T)  # Running sum of the random baseline's rewards
quality_table = zeros(T, K)          # Row t holds the estimates Q after timestep t

rewards_per_arm = zeros(K)           # Total reward collected from each arm
pull_per_arm = zeros(K)              # Number of times each arm was pulled

# The running scalars live in a `let` block: they are true locals, so the loop
# behaves the same in a notebook cell and in a plain script, and the global
# `epsilon` hyperparameter is left untouched (re-running this cell therefore
# always starts from the configured exploration rate).
let
    exploration_rate = epsilon
    reward_total = 0.0
    regret_total = 0.0
    random_total = 0.0

    for t in 1:T
        if rand() < exploration_rate
            # Exploration: any arm, uniformly at random
            k_select = rand(1:K)
        else
            # Exploitation: best current estimate, ties broken uniformly at random
            # (a plain argmax would always favour the lowest index, e.g. arm 1
            # while all estimates are still zero)
            best_arms = findall(==(maximum(Q)), Q)
            k_select = rand(best_arms)
        end

        pulled_arm[t] = k_select

        # Obtain reward
        rewards[t] = rand(arm_dist[k_select])
        rewards_per_arm[k_select] += rewards[t]
        pull_per_arm[k_select] += 1

        # Update the quality table: the estimate is the sample mean of the
        # rewards observed on that arm so far
        Q[k_select] = rewards_per_arm[k_select] / pull_per_arm[k_select]
        quality_table[t, :] .= Q

        # Update the cumulated rewards
        reward_total += rewards[t]
        rewards_cumulated[t] = reward_total

        # Decay the exploration rate
        exploration_rate *= epsilon_decay

        # Regret: expected loss of this pull relative to the best arm, accumulated
        regret_total = regret_total + max_expected_value - arm_expectation[k_select]
        regret[t] = regret_total

        # Random baseline: independent uniformly random pull
        k_random = rand(1:K)
        random_total += rand(arm_dist[k_random])
        random_rewards_cumulated[t] = random_total
    end
end

# Visualise the simulation: per-step rewards, cumulated quantities, pulled arms
# and the evolution of the value estimates.
t = 1:T

# Reference curves built from the best arm's expected reward
maximal_expectation_cumulated = t * max_expected_value
expected_rewards = fill(max_expected_value, T)

# Reward obtained at each timestep against the best expected reward
p = plot()
plot!(p, t, rewards, label = "rewards")
plot!(p, t, expected_rewards, label = "expected_rewards")
display(p)

# Cumulated rewards of the policy, the best-arm reference, the random baseline, and the regret
p = plot(xlabel="Timesteps", ylabel="Cumulated rewards")
plot!(p, t, rewards_cumulated, label = string("Cumulated rewards = ", rewards_cumulated[T]))
plot!(p, t, maximal_expectation_cumulated, label = string("Maximal expectation = ", maximal_expectation_cumulated[T]))
plot!(p, t, random_rewards_cumulated, label = string("Pulling at random = ", random_rewards_cumulated[T]))
plot!(p, t, regret, label = string("Regret = ", regret[T]))
display(p)

# Arm pulled at each timestep
p = plot()
plot!(p, t, pulled_arm, label = "pulled_arm")
display(p)

# Estimated value of each arm over time; the marker at the final timestep shows
# the arm's true expectation. The markers are labelled explicitly so the legend
# does not fill up with auto-generated series names.
p = plot()
for k in 1:K
    plot!(p, t, quality_table[:, k], label = string(k))
    scatter!(p, [T], [arm_expectation[k]], label = string("True expectation ", k))
end
display(p)