<a href="https://colab.research.google.com/github/MuleHakim/Reinforcement-Learning/blob/main/UCB_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5. Upper Confidence Bound (UCB) Algorithm

### Grid World (FrozenLake Environment)

In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
import gymnasium as gym
import numpy as np

In [3]:
# Create the FrozenLake grid world with slipping turned off.
env = gym.make("FrozenLake-v1", is_slippery=False)

In [4]:
# Training settings for the UCB-driven Q-learning run.
num_episodes = 10000  # number of training episodes
max_steps = 100       # cap on steps taken within one episode
gamma = 0.99          # discount factor applied to the next state's value
c = 2                 # weight on the UCB exploration bonus

# Per-(state, action) tables: value estimates and how often each pair was taken.
q_table = np.zeros((env.observation_space.n, env.action_space.n))
action_counts = np.zeros_like(q_table)


In [5]:
# Tabular Q-learning on FrozenLake where actions are chosen by an upper
# confidence bound computed from per-(state, action) visit counts.
for episode in range(num_episodes):
    # reset() returns (observation, info); only the observation is used.
    state = env.reset()[0]
    done = False
    for step in range(max_steps):
        state_counts = action_counts[state, :]
        untried = state_counts == 0
        if untried.any():
            # Take every action in this state once before trusting the bound.
            # This also avoids dividing by a zero count below. argmax of a
            # boolean mask yields the index of the first True entry.
            action = np.argmax(untried)
        else:
            total_count = np.sum(state_counts) + 1
            ucb_values = q_table[state, :] + c * np.sqrt(np.log(total_count) / state_counts)
            action = np.argmax(ucb_values)

        # step() returns (observation, reward, terminated, truncated, info).
        new_state, reward, terminated, truncated, _ = env.step(action)

        action_counts[state, action] += 1
        # A terminal state has no future return, so do not bootstrap from it.
        future_value = 0.0 if terminated else np.max(q_table[new_state, :])
        # Sample-average step size: 1 / (number of times this pair was taken).
        q_table[state, action] += (1 / action_counts[state, action]) * (
                    reward + gamma * future_value - q_table[state, action])

        state = new_state

        # End the episode on either a true terminal state or a time-limit
        # truncation; the environment must be reset before stepping again.
        done = terminated or truncated
        if done:
            break

In [6]:
# Show the learned action-value table (one row per state, one column per action).
print("Q-Table using UCB:", q_table)

Q-Table using UCB: [[1.86696771e-03 1.48643119e-02 8.77712741e-01 1.86699099e-03]
 [3.96387799e-04 0.00000000e+00 9.40428907e-01 4.12004431e-03]
 [8.93922227e-01 9.65640383e-01 9.10794262e-01 9.43002345e-01]
 [9.45907657e-01 0.00000000e+00 9.14468078e-01 9.14468078e-01]
 [3.96936286e-03 2.77257108e-01 0.00000000e+00 4.04813370e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 9.79341074e-01 0.00000000e+00 9.42122681e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.77900948e-01 0.00000000e+00 9.46801072e-01 1.03509709e-01]
 [8.83571049e-01 9.78034614e-01 9.78662359e-01 0.00000000e+00]
 [9.63095715e-01 9.89929326e-01 0.00000000e+00 9.65672095e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 9.78585398e-01 9.89852481e-01 9.62769644e-01]
 [9.78625643e-01 9.89863260e-01 1.00000000e+00 9.79313060e-01]
 [0.00000000e+00 0.00000000e+00 0.00

### Single-State Multi-Armed Bandit


In [7]:
class UCBBandit:
    """Upper-confidence-bound agent for a k-armed bandit.

    Keeps a sample-average reward estimate and a pull count per arm. An arm
    is scored as ``q + c * sqrt(ln(t) / n)`` where ``t`` is the number of
    selections made so far and ``n`` is the arm's pull count.

    Args:
        k: Number of arms.
        c: Weight of the exploration bonus; larger values explore more.
    """

    def __init__(self, k, c=2):
        self.k = k
        self.c = c
        self.q_values = np.zeros(k)       # running mean reward per arm
        self.action_counts = np.zeros(k)  # number of recorded pulls per arm
        self.total_count = 0              # number of select_action() calls

    def select_action(self):
        """Pick the next arm to pull and advance the selection counter.

        Arms that have never been pulled are returned first (lowest index
        first). Their bound is unbounded, so they must win regardless of the
        reward scale; handling them explicitly also avoids a division by a
        zero count.

        Returns:
            Index of the chosen arm.
        """
        self.total_count += 1
        untried = self.action_counts == 0
        if untried.any():
            # argmax over a boolean mask gives the first True position.
            return np.argmax(untried)
        # Every count is > 0 here, so the division is well defined.
        bonus = self.c * np.sqrt(np.log(self.total_count) / self.action_counts)
        return np.argmax(self.q_values + bonus)

    def update_q_value(self, action, reward):
        """Record one pull of ``action`` and fold ``reward`` into its mean.

        Args:
            action: Index of the arm that was pulled.
            reward: Reward observed for that pull.
        """
        self.action_counts[action] += 1
        # Incremental mean: new = old + (reward - old) / n.
        self.q_values[action] += (1 / self.action_counts[action]) * (reward - self.q_values[action])

In [8]:
def simulate_bandit(bandit, arms, num_steps, rng=None):
    """Run ``bandit`` against Gaussian arms and return the reward per step.

    On each step the bandit selects an arm, a reward is drawn from a normal
    distribution centred on that arm's mean with unit standard deviation,
    and the bandit is updated with the observed reward.

    Args:
        bandit: Object exposing ``select_action()`` and
            ``update_q_value(action, reward)``.
        arms: Sequence of mean rewards, indexed by action.
        num_steps: Number of pulls to simulate.
        rng: Optional source of randomness exposing ``normal(loc, scale)``,
            e.g. ``np.random.default_rng(seed)``. When omitted, the global
            ``np.random`` state is used.

    Returns:
        1-D array of length ``num_steps`` holding the sampled rewards.
    """
    sampler = np.random if rng is None else rng
    rewards = np.zeros(num_steps)
    for step in range(num_steps):
        action = bandit.select_action()
        reward = sampler.normal(arms[action], 1.0)  # unit-variance Gaussian reward
        bandit.update_q_value(action, reward)
        rewards[step] = reward
    return rewards

In [9]:
# Seed NumPy's global generator so the sampled rewards, and therefore the
# printed totals and estimates, are reproducible on a fresh kernel.
np.random.seed(42)

# Define the mean rewards for the arms
arms = [1.0, 1.5, 2.0, 0.5, 1.2]
bandit = UCBBandit(k=len(arms))
rewards = simulate_bandit(bandit, arms, num_steps=1000)

In [10]:
# Sum of the per-step rewards collected over the whole simulation.
print("Total Reward:", np.sum(rewards))


Total Reward: 1976.472465072935


In [11]:
# The bandit's final reward estimate for each arm, in the same order as `arms`.
print("Q-Values using UCB for Bandit:", bandit.q_values)


Q-Values using UCB for Bandit: [ 0.67807924  1.09631757  2.05217432 -0.0393629   1.2295482 ]
