# Exercise 7

In [68]:
# The task is to implement a 10-armed bandit problem and solve it with reinforcement learning.
# There are n = 10 possible actions; the mean reward Q*(a) of each action a is drawn from a normal distribution Gauss(0,1).
# The actual reward of an action a is drawn from the distribution Gauss(Q*(a), 1).

# The algorithm is set to perform 1000 plays.
# These plays are repeated 2000 times and the results are averaged.
# The experiment should be run with epsilons of 0.1, 0.01 and 0.009.
# Furthermore, the average number of times that the optimal action was selected should be plotted.

import numpy as np
import random

In [108]:
def get_lambda():
    """Build the reward sampler for a single bandit arm.

    The arm's mean reward is drawn once from a standard normal distribution
    and printed, so it can be compared with empirical averages later.

    Returns:
        A zero-argument callable; each call draws one reward from a normal
        distribution centred on the arm's mean with unit standard deviation.
    """
    true_mean = np.random.normal(loc=0.0, scale=1.0)
    print(true_mean)

    def draw_reward():
        # Unit-variance noise around the arm's fixed mean.
        return np.random.normal(loc=true_mean)

    return draw_reward

def initialize_bandit(number_of_arms):
    """Create a bandit as a list of reward samplers.

    Args:
        number_of_arms: how many arms (samplers) to create.

    Returns:
        A list with `number_of_arms` callables, each produced by a separate
        call to `get_lambda()`.
    """
    bandit = []
    for _ in range(number_of_arms):
        bandit.append(get_lambda())
    return bandit

In [121]:
#just to see if distributions draw correctly
# Sanity check: the empirical mean of many draws per arm should be close to
# the mean that was printed when the arm was created.
ac = initialize_bandit(10)
[arm() for arm in ac]  # a single draw from every arm (result is discarded)

# 10000 rounds of one draw per arm; transposing groups the draws by arm.
ls = [[arm() for arm in ac] for _ in range(10000)]
[np.mean(draws) for draws in zip(*ls)]

-0.6280640320832023
0.7750180044403907
2.1150021778629577
1.5470213855481731
0.9120740886427431
0.5029274911782863
0.6655175245464807
0.49910145264702793
-1.262259938891504
0.5114333454823011


[-0.6185543715314937,
 0.777651490199546,
 2.0826528908812016,
 1.5467261094912934,
 0.9181973294174411,
 0.4973147046933743,
 0.6785826872486319,
 0.4905456838936361,
 -1.2712134932448516,
 0.5273007946916204]

In [122]:
#algorithm as described by exercise sheet

def a_simple_bandit_algo(arms, epsilon, number_of_plays=1000):
    """Run one epsilon-greedy episode on a multi-armed bandit.

    On every play the agent explores (picks a uniformly random arm) with
    probability `epsilon` and otherwise exploits (picks the arm with the
    highest current estimate; ties go to the lowest index). Estimates are
    updated with the incremental sample-average rule.

    Args:
        arms: sequence of zero-argument callables; calling one returns the
            reward for pulling that arm.
        epsilon: exploration probability in [0, 1].
        number_of_plays: number of pulls in the episode (default 1000).

    Returns:
        A tuple `(Q_vec, N_vec, reward_vec)`:
        Q_vec is a numpy array with the estimated mean reward per arm,
        N_vec is a numpy array with the number of pulls per arm, and
        reward_vec is a list with the cumulative reward after each play.

    Raises:
        ValueError: if `arms` is empty.
    """
    number_of_arms = len(arms)
    if number_of_arms == 0:
        raise ValueError("arms must contain at least one arm")

    Q_vec = np.zeros(number_of_arms)
    N_vec = np.zeros(number_of_arms)
    reward = 0
    reward_vec = []
    for _ in range(number_of_plays):
        # Explore with probability epsilon, exploit otherwise. The original
        # comparison was inverted and explored with probability 1 - epsilon.
        if random.random() < epsilon:
            arm_num = random.randint(0, number_of_arms - 1)
        else:
            arm_num = np.argmax(Q_vec)

        new_reward = arms[arm_num]()
        reward += new_reward
        reward_vec.append(reward)  # running total, not the per-step reward

        # Incremental sample average: Q <- Q + (R - Q) / N
        N_vec[arm_num] += 1
        Q_vec[arm_num] += (new_reward - Q_vec[arm_num]) / N_vec[arm_num]
    return (Q_vec, N_vec, reward_vec)

In [123]:
#returns expected rewards for actions, number of times each action got taken and the rewards over 1000 plays averaged over 2000 samples
def test_for_epsilon(bandits, number_of_arms, epsilon):
    
    Q_vec = np.array([0 for i in range(number_of_arms)])
    N_vec = np.array([0 for i in range(number_of_arms)])
    reward = np.array([0.0 for i in range(1000)])
    for i in range(2000):
        Q,N,R = a_simple_bandit_algo(bandits, epsilon)
        Q_vec = Q_vec + Q
        N_vec = N_vec + N
        reward += np.array(R)
    Q_vec = list(map(lambda x: x/1000.0, Q_vec))
    N_vec = list(map(lambda x: x/1000.0, N_vec))
    reward = list(map(lambda x: x/1000.0, reward))
    return (Q_vec,N_vec,reward)

In [124]:
#Test same bandits for three different epsilon

# One shared set of 10 arms, so the three epsilon settings are compared on
# the same bandit.
bandits = initialize_bandit(10)

# Evaluated in order: epsilon = 0.1, then 0.01, then 0.009.
Sample1, Sample2, Sample3 = (
    test_for_epsilon(bandits, len(bandits), eps) for eps in (0.1, 0.01, 0.009)
)

3.0857618619404144
0.5381166414055172
0.9993809717704768
-0.16751084095923807
0.48006802669114634
-0.4658682233086033
0.7979937955582801
0.5374251282016688
-1.0641870292795732
0.46864039869782353
