In [1]:
import numpy as np
import pandas as pd
import random
# Seed NumPy's global RNG and the stdlib `random` module so repeated
# runs from a fresh kernel draw the same pseudo-random sequence.
np.random.seed(42)
random.seed(42)

### Create a custom environment whose action space has `arms_length` arms

In [2]:
class ENV1:
    """Minimal multi-armed bandit environment.

    The environment holds ``arms_length`` arms labelled ``1..arms_length``.
    Rewards are drawn from NumPy's global random state and do not depend
    on which arm was pulled.
    """

    def __init__(self,arms_length):
        """Build the list of arm labels, numbered from 1 to ``arms_length``."""
        self.arms = list(range(1, arms_length + 1))

    def arms_return(self):
        """Return the list of arm labels held by this environment."""
        return self.arms

    def reward_return(self):
        """Draw a random integer reward between -10 and 25 (both inclusive)."""
        lowest = -10
        exclusive_upper = 26  # randint excludes the upper bound, so 25 is the max
        return np.random.randint(lowest, exclusive_upper)

    def print_action(self,arm_index):
        """Print the label of the arm stored at position ``arm_index``."""
        print(f"Arm pulled is {self.arms[arm_index]} \n")

### Define the epsilon-greedy and UCB policies

In [3]:
def epsilon_greedy(Q_values,N_values,epsilon):
    """Choose an arm index with the epsilon-greedy rule.

    Args:
        Q_values: per-arm value estimates.
        N_values: per-arm pull counts; accepted but not used by this policy.
        epsilon: probability threshold for taking a uniformly random arm.

    Returns:
        A random index in ``range(len(Q_values))`` when the uniform draw is
        below ``epsilon``; otherwise the index of the largest value estimate
        (``np.argmax`` returns the first one on ties).
    """
    # One uniform draw decides between exploring and exploiting.
    should_explore = np.random.rand() < epsilon
    if not should_explore:
        return np.argmax(Q_values)
    return np.random.choice(len(Q_values))

    
def ucb(Q_values,N_values,steps,c):
    """Choose an arm index using an upper-confidence-bound score.

    Args:
        Q_values: per-arm value estimates.
        N_values: per-arm pull counts.
        steps: step counter; ``steps + 1`` is used inside the logarithm.
        c: multiplier applied to the exploration bonus.

    Returns:
        Index of the arm with the highest score. An arm with a pull count of
        zero scores ``np.inf``, so it is preferred over any arm already pulled.
    """
    def upper_bound(arm):
        pulls = N_values[arm]
        if pulls == 0:
            return np.inf
        # Value estimate plus a bonus that shrinks as the arm is pulled more.
        return Q_values[arm]+ c * (np.sqrt(2 * np.log(steps+1) / pulls))

    return np.argmax([upper_bound(arm) for arm in range(len(Q_values))])
                

### Define the variables

In [4]:
# Bandit size and the environment instance used by the policy runs.
arms_length = 12
env = ENV1(arms_length)

# Per-arm statistics, one zero-initialised slot per arm:
# Q_values holds value estimates, N_values holds pull counts.
Q_values = np.zeros(arms_length)
N_values = np.zeros(arms_length)

# Starting value of the step counter handed to the run functions.
steps = 0

# Policy hyperparameters: exploration probability for epsilon-greedy and
# the exploration-bonus multiplier for UCB.
epsilon = 0.7
c = 4.2

### Run the policies

In [7]:
def run_epsilon_policy(Q_values,N_values,epsilon,env,steps,max_steps=5000,verbose=True):
    """Run the epsilon-greedy policy until the step counter reaches ``max_steps``.

    Args:
        Q_values: per-arm value estimates; updated in place.
        N_values: per-arm pull counts; updated in place.
        epsilon: exploration probability forwarded to ``epsilon_greedy``.
        env: object providing ``reward_return()`` and ``print_action(index)``.
        steps: starting value of the step counter.
        max_steps: the loop runs while ``steps < max_steps``. Defaults to
            5000, the horizon that was previously hard-coded.
        verbose: when True (the default) every step is printed exactly as
            before; pass False to run silently.

    Returns:
        The sum of all rewards collected during the run.
    """
    if verbose:
        print("running the epsilon greedy policy \n")
    epsilon_reward = 0
    while steps<max_steps:
        action = epsilon_greedy(Q_values,N_values,epsilon)
        N_values[action]+=1
        reward = env.reward_return()
        if verbose:
            # Logging does not consume randomness, so grouping it after the
            # reward draw leaves both the results and the printed order intact.
            print(f"step: {steps+1} \n")
            print(f"the action arm chosen is {action+1} \n")
            env.print_action(action)
            print(f"the reward got is  {reward} \n")
        # Incremental mean: move the estimate towards the new reward by 1/N.
        Q_values[action] +=(reward - Q_values[action]) /N_values[action]
        epsilon_reward+=reward
        steps+=1
    if verbose:
        print(f"Epsilon greedy policy for {max_steps} steps done \n")
    return epsilon_reward

def run_ucb_policy(Q_values,N_values,c,env,steps,max_steps=5000,verbose=True):
    """Run the UCB policy until the step counter reaches ``max_steps``.

    Args:
        Q_values: per-arm value estimates; updated in place.
        N_values: per-arm pull counts; updated in place.
        c: exploration-bonus multiplier forwarded to ``ucb``.
        env: object providing ``reward_return()`` and ``print_action(index)``.
        steps: starting value of the step counter; also passed to ``ucb``
            on every iteration.
        max_steps: the loop runs while ``steps < max_steps``. Defaults to
            5000, the horizon that was previously hard-coded.
        verbose: when True (the default) every step is printed exactly as
            before; pass False to run silently.

    Returns:
        The sum of all rewards collected during the run.
    """
    if verbose:
        print("running the ucb policy \n")
    ucb_reward = 0
    while steps<max_steps:
        action = ucb(Q_values,N_values,steps,c)
        N_values[action]+=1
        reward = env.reward_return()
        if verbose:
            # Logging does not consume randomness, so grouping it after the
            # reward draw leaves both the results and the printed order intact.
            print(f"step: {steps+1} \n")
            print(f"the action arm chosen is {action+1} \n")
            env.print_action(action)
            print(f"the reward got is  {reward} \n")
        # Incremental mean: move the estimate towards the new reward by 1/N.
        Q_values[action] +=(reward - Q_values[action]) /N_values[action]
        ucb_reward+=reward
        steps+=1
    if verbose:
        print(f"UCB policy for {max_steps} steps done \n")
    return ucb_reward

In [8]:
# Each run receives its own copies of Q_values / N_values, so the in-place
# updates made by one policy do not carry over to the other. Both runs use
# the same env object and the same starting `steps` value.
# NOTE: the order of these two calls matters if env draws rewards from
# NumPy's global RNG; the second run would then continue the random stream
# where the first one stopped.
epsilon_reward = run_epsilon_policy(Q_values.copy(),N_values.copy(),epsilon,env,steps)
ucb_reward = run_ucb_policy(Q_values.copy(),N_values.copy(),c,env,steps)

running the epsilon greedy policy 

step: 1 

the action arm chosen is 7 

Arm pulled is 7 

the reward got is  -10 

step: 2 

the action arm chosen is 1 

Arm pulled is 1 

the reward got is  19 

step: 3 

the action arm chosen is 10 

Arm pulled is 10 

the reward got is  16 

step: 4 

the action arm chosen is 1 

Arm pulled is 1 

the reward got is  25 

step: 5 

the action arm chosen is 3 

Arm pulled is 3 

the reward got is  25 

step: 6 

the action arm chosen is 4 

Arm pulled is 4 

the reward got is  14 

step: 7 

the action arm chosen is 4 

Arm pulled is 4 

the reward got is  6 

step: 8 

the action arm chosen is 12 

Arm pulled is 12 

the reward got is  20 

step: 9 

the action arm chosen is 4 

Arm pulled is 4 

the reward got is  -7 

step: 10 

the action arm chosen is 9 

Arm pulled is 9 

the reward got is  -10 

step: 11 

the action arm chosen is 3 

Arm pulled is 3 

the reward got is  23 

step: 12 

the action arm chosen is 7 

Arm pulled is 7 

the rewa

### Print the cumulative rewards

In [9]:
# Report the total reward each policy returned from its run.
print(f"the total cumulative reward for epsilon greedy policy is: {epsilon_reward} \n")
print(f"the total cumulative reward for UCB policy is: {ucb_reward} \n")

the total cumulative reward for epsilon greedy policy is: 37875 

the total cumulative reward for UCB policy is: 36680 



### Comparing the rewards