In [None]:
# Class-type for the medical bottles experiment
# Fifth version by JAGR.
# Sample-average method to update action values (Q-values) using ARGMAX action selection, with q_values initialized to 1.0. Exploitation only.
# Here, the agent always exploits the current knowledge without exploration.
# Initializing q_values optimistically forces the agent to try all bottles at least once. Most of the time, the agent exploits the best-known bottle.

import random

class bottle:
    """A bottle whose trial succeeds with a fixed, hidden probability.

    Each bottle also carries the agent's running estimate of its value:
    ``q_value`` starts at 1.0 and, once the bottle has been tried, becomes
    the sample average of the rewards observed from it.
    """

    def __init__(self, name, real_prob):
        """Create a bottle.

        Args:
            name: Label used when printing the bottle.
            real_prob: Probability that a single trial returns reward 1.
        """
        self.name = name
        self.real_prob = real_prob

        self.q_value = 1.0       # Initial estimate of the bottle's value
        self.count = 0           # Number of times the bottle has been used
        self.accum_reward = 0.0  # Total reward obtained from this bottle

    def try_bottle(self):
        """Simulate one trial: return 1 with probability ``real_prob``, else 0."""
        # Exactly one random draw per trial.
        return 1 if random.random() < self.real_prob else 0

    def learn(self, reward):
        """Record ``reward`` and set ``q_value`` to the average reward so far."""
        self.count += 1
        self.accum_reward += reward
        # count is at least 1 here, so the division is always safe.
        self.q_value = self.accum_reward / self.count

    def __str__(self):
        return f"Bottle {self.name} -> Q: {self.q_value:.4f} (Chosen {self.count} times)"

In [None]:
# Create bottles with different real probabilities

import numpy as np

# Three bottles, each with its own hidden success probability.
bottles = [bottle("P", 0.25), bottle("Y", 0.75), bottle("B", 0.50)]

# Per-step logs: which bottle was tried, plus a snapshot of every bottle's
# Q-value and accumulated reward after the update.
tried_bottles_history = []
q_values_history = []
accumulated_rewards_history = []

n_steps = 10000
for t in range(1, n_steps + 1):
    # Greedy pick: highest current Q-value; on ties max() keeps the bottle
    # that comes first in the list.
    selected_bottle = max(bottles, key=lambda candidate: candidate.q_value)

    # Try the chosen bottle and fold the outcome into its estimate.
    reward = selected_bottle.try_bottle()
    selected_bottle.learn(reward)

    tried_bottles_history.append(selected_bottle.name)
    q_values_history.append([each.q_value for each in bottles])
    accumulated_rewards_history.append([each.accum_reward for each in bottles])
    #print(f"Iteration {t}: Tried bottle {selected_bottle.name}, Reward: {reward}, Q-value: {selected_bottle.q_value:.4f}, Count: {selected_bottle.count}, Accumulated Reward: {selected_bottle.accum_reward}")

# Final summary, one line per bottle.
print(*bottles, sep="\n")

In [None]:
import matplotlib.pyplot as plt

# Histogram of how often each bottle was tried.
# Bin by each bottle's position in `bottles` so that the bars and the tick
# labels always line up, independent of the order in which bottles were
# first tried and of how many bottles there are.
bottle_names = [b.name for b in bottles]
position_of = {name: pos for pos, name in enumerate(bottle_names)}
tried_positions = [position_of[name] for name in tried_bottles_history]

# One unit-wide bin centred on each integer position.
plt.hist(tried_positions, bins=np.arange(-0.5, len(bottle_names) + 0.5, 1), rwidth=0.8)
plt.xticks(list(range(len(bottle_names))), bottle_names)
plt.grid(axis='y')
plt.xlabel('Bottles')
plt.ylabel('Number of times tried')
plt.title('Histogram of Tried Bottles')
plt.show()

In [None]:
# Q-value estimate of every bottle after each simulation step.
# Convert the log once: one row per step, one column per bottle.
q_matrix = np.array(q_values_history)
step_axis = range(1, n_steps + 1)
for column, curve_label in enumerate(('Bottle P', 'Bottle Y', 'Bottle B')):
    plt.plot(step_axis, q_matrix[:, column], label=curve_label)
plt.grid()
plt.xlabel('Simulation Steps')
plt.ylabel('Q-values')
plt.title('$Q_t(a)$ estimation over Time - Sample Average Method')
plt.legend()
plt.show()

In [None]:
# Accumulated reward of every bottle after each simulation step.
# Convert the log once: one row per step, one column per bottle.
reward_matrix = np.array(accumulated_rewards_history)
step_axis = range(1, n_steps + 1)
for column, curve_label in enumerate(('Bottle P', 'Bottle Y', 'Bottle B')):
    plt.plot(step_axis, reward_matrix[:, column], label=curve_label)
plt.grid()
plt.xlabel('Simulation Steps')
plt.ylabel('Accumulated Rewards')
plt.title('Accumulated Rewards - Sample Average Method')
plt.legend()
plt.show()