In [None]:
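# Epsilon-Greedy Action-Value Estimation
# Sample-Average Method for Estimating Action Values
# Note: this cell selects actions uniformly at random (pure exploration, i.e. epsilon = 1);
# the estimated Q-values are not yet used to choose actions.
# RL Course - JAGR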

import random

# Bottles: the three actions the agent can choose from.
bottle_actions = ["P", "Y", "B"]

# Number of simulated pulls. Kept as a named constant so the loop bound below
# cannot silently drift from the intended step count.
n_steps = 100

# Fixed seed (same value the later cells use) so that Restart & Run All
# reproduces the same estimates.
random.seed(12)

# Per-action statistics, indexed in the same order as `bottle_actions`.
q_values = [0.0, 0.0, 0.0]   # sample-average estimate of each action's reward
counts = [0, 0, 0]           # how many times each action has been selected
acum_rewards = [0, 0, 0]     # accumulated reward obtained from each action


# Real-world simulation: probability that each bottle pays a reward of 1.
# Hidden from the agent; used only to generate rewards.
real_probabilities = [0.25, 0.75, 0.50]

# `range(1, n_steps + 1)` runs exactly `n_steps` iterations with t = 1..n_steps
# (the previous `range(1, 100)` performed only 99).
for t in range(1, n_steps + 1):
    # Policy: uniformly random action selection (no use of q_values yet).
    selected_index = random.randint(0, len(bottle_actions) - 1)
    action = bottle_actions[selected_index]  # label, used by the optional trace below

    # Bernoulli reward drawn from the hidden success probability.
    reward = 1 if random.random() < real_probabilities[selected_index] else 0

    counts[selected_index] += 1
    acum_rewards[selected_index] += reward

    # Sample-average update: Q(a) = accumulated reward of a / times a was selected.
    # counts[selected_index] is at least 1 here, so the division is safe.
    q_values[selected_index] = acum_rewards[selected_index] / counts[selected_index]

    # Optional per-step trace (uncomment to inspect the learning process):
    #print(f"Iteration {t}: Selected Bottle: {action}, Reward: {reward}, Q-values: {q_values}, Accumulated Rewards: {acum_rewards}, Counts: {counts}")

# Final Q-values after all iterations
print(f"Final Q-values: {q_values}")




In [None]:
# Plotting Q-values over Steps
# Plotting version - Same as above but with plotting
# JAGR - RL Course

import matplotlib.pyplot as plt
# Fixed seed so the simulation (and therefore the plot) is reproducible.
random.seed(12)

# Bottles: the three actions the agent can choose from.
bottle_actions = ["P", "Y", "B"]

# Per-action statistics, indexed in the same order as `bottle_actions`.
q_values = [0.0, 0.0, 0.0]   # sample-average estimate of each action's reward
counts = [0, 0, 0]           # how many times each action has been selected
acum_rewards = [0, 0, 0]     # accumulated reward obtained from each action
n_steps = 10000              # number of simulated steps

# q_values_history[i][k] is the estimate for action i after step k + 1.
q_values_history = [[], [], []]

# Real-world simulation: probability that each bottle pays a reward of 1.
# Hidden from the agent; used only to generate rewards.
real_probabilities = [0.25, 0.75, 0.50]

# Steps are numbered 1..n_steps inclusive. The previous `range(1, n_steps)`
# simulated only n_steps - 1 steps; the same range object is reused for the
# x-axis below so the loop and the plot can never disagree in length.
steps = range(1, n_steps + 1)

for t in steps:
    # Policy: uniformly random action selection (no use of q_values yet).
    selected_index = random.randint(0, len(bottle_actions) - 1)
    action = bottle_actions[selected_index]  # label, used by the optional trace below

    # Bernoulli reward drawn from the hidden success probability.
    reward = 1 if random.random() < real_probabilities[selected_index] else 0

    counts[selected_index] += 1
    acum_rewards[selected_index] += reward

    # Sample-average update; counts[selected_index] is at least 1 here.
    q_values[selected_index] = acum_rewards[selected_index] / counts[selected_index]

    # Record the current estimate of every action (not only the selected one)
    # so that all three curves have one point per step.
    for i in range(len(bottle_actions)):
        q_values_history[i].append(q_values[i])

    # Optional per-step trace (uncomment to inspect the learning process):
    #print(f"Iteration {t}: Selected Bottle: {action}, Reward: {reward}, Q-values: {q_values}, Accumulated Rewards: {acum_rewards}, Counts: {counts}")

# Final Q-values after all iterations
print(f"Final Q-values: {q_values}")

# Plot Q-values over time: one curve per bottle, labelled "Bottle P/Y/B".
for bottle_name, history in zip(bottle_actions, q_values_history):
    plt.plot(steps, history, label=f"Bottle {bottle_name}")
plt.xlabel("Simulation Steps (t)")
plt.ylabel("Estimated average reward ($Q_t$)")
plt.title("Q-values over steps for each bottle")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Plotting Percentage of Optimal Action Selection over Steps
# JAGR - RL Course


import random
import matplotlib.pyplot as plt
import numpy as np

# Seed the global RNG so every run of this cell yields the same curve.
random.seed(12)

# --- Experiment configuration ---
n_steps = 5000                               # steps per independent run
n_runs = 200                                 # independent runs to average over
real_probabilities = [0.25, 0.75, 0.50]      # hidden reward probability of each action
optimal_action = 1                           # index of the action with the highest probability

# One row per run, one column per step; a cell holds 1 when the optimal action
# was selected at that step of that run and 0 otherwise.
optimal_choices_history = np.zeros((n_runs, n_steps))

for run in range(n_runs):
    # Fresh statistics at the start of every run.
    counts = [0] * 3
    acum_rewards = [0] * 3
    q_values = [0.0] * 3

    # Collect this run's hit/miss flags and write them into the matrix in one go.
    optimal_flags = []

    for t in range(n_steps):
        # Policy: uniformly random choice among the three actions.
        selected_index = random.randint(0, 2)

        # Record whether the optimal action was selected at this step.
        optimal_flags.append(selected_index == optimal_action)

        # Environment response: reward 1 with the action's hidden probability, else 0.
        reward = int(random.random() < real_probabilities[selected_index])

        # Sample-average update of the selected action's estimate.
        counts[selected_index] += 1
        acum_rewards[selected_index] += reward
        q_values[selected_index] = acum_rewards[selected_index] / counts[selected_index]

    # Booleans are stored as 1.0 / 0.0 in the float matrix.
    optimal_choices_history[run] = optimal_flags

# Average over runs (axis 0) to get, for each step, the percentage of runs in
# which the optimal action was selected at that step.
percent_optimal_action = optimal_choices_history.mean(axis=0) * 100

# Draw the run-averaged percentage of optimal-action selections for each step.
# The y-axis is pinned to the full 0-100 range so the level of the curve is
# read against the whole percentage scale.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(n_steps), percent_optimal_action)
ax.set(
    xlabel="Steps",
    ylabel="% Optimal Action",
    title="Performance: Percentage of Optimal Action Selection (Random Policy)",
    ylim=(0, 100),
)
ax.grid(True)
plt.show()