In [1]:
import numpy as np

In [7]:
K=5 # K-Arms
epsilon=0.3 # Exploration rate high
num_impression = 1000

true_click_probabilities = np.array([0.25,0.16, 0.2, 0.28, 0.12])

In [4]:
def epsilon_greedy(epsilon, num_impression):
  estimated_clicks = np.zeros(K)
  num_selection = np.zeros(K)
  clicks = np.zeros(num_impression)

  for impression in range(num_impression):
    if np.random.rand() < epsilon:
      ad = np.random.randint(K) #Explore
    else:
      ad = np.argmax(estimated_clicks) #Exploit

    click = np.random.rand() < true_click_probabilities[ad]
    num_selection[ad] += 1
    estimated_clicks[ad] += (click - estimated_clicks[ad]) # / num_selection[ad]
    clicks[impression] = click

  total_clicks = np.sum(clicks)

  return total_clicks

In [5]:
def ucb(num_impression):
  estimated_clicks = np.zeros(K)
  num_selection = np.zeros(K)
  clicks = np.zeros(num_impression)

  for impression in range(num_impression):
    if impression < K:
      ad = impression
    else:
      ucb_values = estimated_clicks * np.sqrt(np.log(impression) / num_selection)
      ad = np.argmax(ucb_values)

    click = np.random.rand() < true_click_probabilities[ad]
    num_selection[ad] += 1
    estimated_clicks[ad] += (click - estimated_clicks[ad]) # / num_selection[ad]
    clicks[impression] = click

  total_clicks = np.sum(clicks)

  return total_clicks

In [8]:
np.random.seed(10)
epsilon_greedy_clicks = epsilon_greedy(epsilon, num_impression)
ucb_clicks = ucb(num_impression)

print(f"Epsilon Greedy total clicks: {epsilon_greedy_clicks}")
print(f"UCB total clicks: {ucb_clicks}")

Epsilon Greedy total clicks: 245.0
UCB total clicks: 223.0


### Qlearning

In [9]:
import gym
import numpy as np
from IPython.display import display , clear_output
import time

In [10]:
from types import MethodWrapperType
env = gym.make("Taxi-v3")
env.reset()
output = env.render(mode='ansi')
clear_output(wait=True)
print(output)

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+




In [11]:
env = gym.make("Taxi-v3")

num_action = env.action_space.n
num_state = env.observation_space.n

Q = np.zeros((num_state, num_action))

learning_rate = 0.1
discount_rate = 0.99
epsilon = 0.9
epsilon_decay = 0.99
min_epsilon = 0.1

num_episodes = 2

def render_env(env, reward):
  output = env.render(mode='ansi')
  clear_output(wait=True)
  print(output)
  print(f"Reward: {reward}")
  time.sleep(0.5)


for episode in range(num_episodes):
  state = env.reset()
  done = False
  total_reward = 0

  while not done:
    if np.random.rand() < epsilon:
      action = env.action_space.sample()
    else:
      action = np.argmax(Q[state])

    next_state, reward, done, _ = env.step(action)

    best_next_action = np.argmax(Q[next_state])
    Q[state, action] += learning_rate * (reward + discount_rate * Q[next_state, best_next_action] - Q[state, action])

    state = next_state
    render_env(env, reward)
    time.sleep(0.5)

    total_reward += reward

  epsilon *= epsilon_decay
  epsilon = max(min_epsilon, epsilon)

  print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Reward: -10
Episode: 2, Total Reward: -731
