# In [None]:
import numpy as np
import matplotlib.pyplot as plt

import random
import time
import math

# Seed the standard-library RNG from the current time in nanoseconds, so each
# execution draws different arm probabilities and rewards.
random.seed(time.time_ns())

class KBandit:
    """K-armed Bernoulli bandit that tracks sample-average reward estimates.

    Attributes:
        K: number of arms.
        n: total number of pulls recorded so far, across all arms.
        probs: per-arm success probability, drawn uniformly from [0, 1].
        reward_list: per-arm sample-average of the rewards observed so far.
        freqs: per-arm pull count.
    """

    def __init__(self, k):
        self.K = k
        self.n = 0

        self._initialize_qs()
        self.reward_list = [0] * k
        self.freqs = [0] * k

    def _initialize_qs(self):
        """Draw every arm's success probability uniformly from [0, 1]."""
        self.probs = [random.uniform(0, 1) for _ in range(self.K)]

    def _update_reward_estimates(self, action_index, new_rew):
        """Fold ``new_rew`` into the running mean of arm ``action_index``.

        Uses the incremental sample-average rule ``Q += (R - Q) / N`` where
        ``N`` is the number of times *this arm* has been pulled, including the
        pull being recorded. Also advances the global pull counter ``n``.
        """
        # ``freqs`` is incremented by the caller after this update, so the
        # pull being recorded is not yet counted there; hence the ``+ 1``.
        # Dividing by the global step counter instead would skew every arm's
        # estimate by the pulls made on the other arms.
        pulls = self.freqs[action_index] + 1
        current = self.reward_list[action_index]
        self.reward_list[action_index] = current + (new_rew - current) / pulls

        self.n += 1

    def action_reward(self, action_index):
        """Pull arm ``action_index`` and return its reward.

        The reward is 1 with probability ``probs[action_index]`` and 0
        otherwise. The arm's reward estimate and pull count are updated
        before returning.
        """
        rew = 1 if random.uniform(0, 1) < self.probs[action_index] else 0

        self._update_reward_estimates(action_index, rew)
        self.freqs[action_index] += 1

        return rew

class Q1:
    """Upper-confidence-bound (UCB) agent run against a 10-armed ``KBandit``.

    After ``run`` finishes:
        rewards: running average reward after each step.
        optimal_percs: running percentage of steps on which the arm with the
            highest true probability was chosen, after each step.

    Args:
        log_freq: print progress every ``log_freq`` steps when logging is on.
        log: when True, ``run`` prints the running average reward every
            ``log_freq`` steps.
    """

    def __init__(self, log_freq=100, log=False):
        self.log = log
        self.log_freq = log_freq

        self._init_tb()

    def _init_tb(self):
        """Create a fresh bandit and reset all per-run metrics."""
        self.tb = KBandit(10)

        self.current_reward = 0
        self.rewards = []

        self.num_optimal_actions = 0
        self.optimal_percs = []

    def _ucb_formula(self, t, qt, nt, C):
        """Return the UCB score ``qt + C * sqrt(ln(t + e) / (nt + 1))``.

        Args:
            t: zero-based step index.
            qt: current reward estimate of the arm.
            nt: number of times the arm has been pulled so far.
            C: exploration coefficient; 0 gives a purely greedy score.
        """
        # Shift both terms so the score is defined on the very first step:
        # ln(0 + e) == 1 and an unpulled arm has a denominator of 1.
        ln_t = math.log(t + np.e)
        nt_a = nt + 1

        # The exploration bonus is a square root; a base-2 logarithm here
        # would turn negative as soon as the ratio drops below 1.
        return qt + C * math.sqrt(ln_t / nt_a)

    def _ucb_action(self, t, qts, nts, C):
        """Return the index of the arm with the highest UCB score at step ``t``."""
        scores = [self._ucb_formula(t, qt, nt, C) for qt, nt in zip(qts, nts)]
        return int(np.argmax(scores))

    def run(self, episodes, exploration_coef=0.1, log=False):
        """Play ``episodes`` steps on a fresh bandit and record the metrics.

        Args:
            episodes: number of steps to play.
            exploration_coef: UCB exploration coefficient ``C``.
            log: when True, print %optimal and average reward every
                ``log_freq`` steps and on the final step.
        """
        self._init_tb()

        # The true best arm is fixed for the whole run, so look it up once.
        best_arm = int(np.argmax(self.tb.probs))

        for i in range(episodes):
            action_index = self._ucb_action(i, self.tb.reward_list, self.tb.freqs, exploration_coef)
            new_rew = self.tb.action_reward(action_index)

            self.current_reward += new_rew
            if action_index == best_arm:
                self.num_optimal_actions += 1

            # Record metrics only after this step's action has been counted,
            # so both running averages cover exactly ``steps_done`` steps.
            steps_done = i + 1
            self.optimal_percs.append(self.num_optimal_actions * 100 / steps_done)
            self.rewards.append(self.current_reward / steps_done)

            if self.log and steps_done % self.log_freq == 0:
                print(f'Step: {steps_done}, average reward:  {self.rewards[-1]}')

            if log and (i % self.log_freq == 0 or i == episodes - 1):
                print(f'Episode {i}: %optimal: {self.optimal_percs[-1]}, avg reward: {self.current_reward / (i+1)}')

####### DEMONSTRATION

# Exploration coefficients (the UCB `C` value) compared in the experiment;
# also used to label the plot legend.
CONFIDENCE_COEFFICIENTS = [0, 0.01, 0.05, 0.1, 0.25]
# Number of steps played in each run.
STEPS = 5000
# Number of independent runs averaged for the final plots.
RUNS = 1000

def fetch_optimal_metric(coefs, steps):
    """Run one UCB experiment per exploration coefficient.

    A single ``Q1`` agent is reused; each ``run`` call resets it onto a fresh
    bandit before playing.

    Args:
        coefs: exploration coefficients to evaluate, in order.
        steps: number of steps to play for each coefficient.

    Returns:
        ``(optimal_curves, reward_curves)``: two lists with one per-step
        series per coefficient, holding the agent's ``optimal_percs`` and
        ``rewards`` respectively.
    """
    agent = Q1()
    optimal_curves, reward_curves = [], []

    for coef in coefs:
        agent.run(steps, coef)
        optimal_curves.append(agent.optimal_percs)
        reward_curves.append(agent.rewards)

    return optimal_curves, reward_curves

def plot_diff_coefs(plots, yLabel, lim=-1):
    """Plot one curve per exploration coefficient and show the figure.

    Args:
        plots: iterable of per-step series, one per entry of
            ``CONFIDENCE_COEFFICIENTS`` and in the same order (the legend is
            built from that constant).
        yLabel: text for the y-axis label.
        lim: slice end applied to every series. The default ``-1`` means
            "no limit" and draws every point; any other value is used as the
            slice end unchanged.
    """
    # A literal ``series[:-1]`` would silently drop the final step, so map the
    # "no limit" sentinel to an open-ended slice.
    end = None if lim == -1 else lim

    for series in plots:
        plt.plot(series[:end])

    plt.legend([f'c = {C}' for C in CONFIDENCE_COEFFICIENTS], loc='lower right')
    plt.xlabel('Steps')
    plt.ylabel(f'{yLabel}')
    plt.show()

def multiple_runs(n=10):
    """Average the experiment metrics over ``n`` independent repetitions.

    Each repetition evaluates every coefficient in
    ``CONFIDENCE_COEFFICIENTS`` for ``STEPS`` steps.

    Returns:
        ``(mean_optimal, mean_rewards)``: two arrays of shape
        ``(len(CONFIDENCE_COEFFICIENTS), STEPS)`` holding the element-wise
        mean of the %-optimal and average-reward curves.
    """
    shape = (len(CONFIDENCE_COEFFICIENTS), STEPS)
    optimal_total = np.zeros(shape)
    reward_total = np.zeros(shape)

    for _ in range(n):
        run_optimal, run_rewards = fetch_optimal_metric(CONFIDENCE_COEFFICIENTS, STEPS)
        optimal_total += np.array(run_optimal)
        reward_total += np.array(run_rewards)

    mean_optimal = optimal_total / n
    mean_rewards = reward_total / n
    return mean_optimal, mean_rewards

## Runs and logs for one run
q = Q1()
q.run(5000, 0.1, True)

# Averages RUNS independent runs per coefficient and graphs the results
opts, rews = multiple_runs(RUNS)
plot_diff_coefs(opts, '% Optimal Actions')
plot_diff_coefs(rews, 'Average Rewards')