## **Upper Confidence Bound (UCB) Exploration Algorithm**

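Upper Confidence Bound (UCB) is an action-selection strategy that balances exploration and exploitation by adding an uncertainty bonus to each action's estimated reward: the less often an action has been tried, the larger its bonus. In this notebook the action chosen in state $s$ is $\arg\max_a \left[ Q(s,a) + c \sqrt{\ln(n_s + 1) / (N(s,a) + 1)} \right]$, where $N(s,a)$ counts how often action $a$ has been taken in $s$, $n_s = \sum_a N(s,a)$, and $c$ controls the amount of exploration.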

**Imports**

In [3]:
import numpy as np
import gym

**Environment Setup and Hyperparameters**

In [None]:
# Environment setup: a single shared Gym environment used by the cells below.
env = gym.make('CartPole-v1')

# Hyperparameters (module-level; read by the policy and the learning loop)
alpha = 0.1  # Learning rate: step size applied to the TD error in the Q update
gamma = 0.99  # Discount factor: weight on the best next-state value in the Q update
c = 2  # Exploration factor in UCB: multiplies the square-root bonus term

**Model Building**

In [None]:
# Value-estimate table Q and action-count table N, both starting at zero.
# Rows: env.observation_space.shape[0]; columns: env.action_space.n.
# NOTE(review): shape[0] looks like the length of one observation vector, not
# a count of distinct states -- presumably observations must be mapped to
# integer row indices before they can index these tables; confirm against
# the training loop.
Q = np.zeros((env.observation_space.shape[0], env.action_space.n))
N = np.zeros_like(Q)  # Same shape and dtype as Q

def ucb_policy(state):
    """Choose an action for ``state`` with an upper-confidence-bound rule.

    Reads the module-level tables ``Q`` (value estimates) and ``N`` (action
    counts), the exploration weight ``c``, and ``env`` for the number of
    actions.

    If the counts in ``N[state]`` sum to zero, an action index is drawn at
    random from ``range(env.action_space.n)``. Otherwise the index that
    maximises ``Q[state] + c * sqrt(log(total + 1) / (N[state] + 1))`` is
    returned; ``np.argmax`` resolves ties in favour of the lowest index.
    """
    visits = N[state]
    total_visits = np.sum(visits)

    # Nothing recorded for this state yet, so there is no basis for ranking.
    if total_visits == 0:
        return np.random.choice(env.action_space.n)

    # The +1 terms keep the bonus finite for untried actions and strictly
    # positive from the very first recorded visit.
    exploration_bonus = c * np.sqrt(np.log(total_visits + 1) / (visits + 1))
    return np.argmax(Q[state] + exploration_bonus)

def ucb_q_learning(env, n_episodes=1000):
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = ucb_policy(state)  # Select action based on UCB policy
            next_state, reward, done, _ = env.step(action)

            # Update Q-value and action count
            N[state, action] += 1
            Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])