# 1. N-Armed Bandit

## 1.1 All $Q_{ai}$ are initialized to 0
## 1.2 The rewards are subject to noise according to a normal distribution with mean $Q^*_{ai}$ and standard deviation $\sigma_i$, $\forall i = 1, \dots, N$.

For this exercise you will have to implement the N-Armed bandit problem using each of the following
action selection methods:

* Random
* $\epsilon$-greedy with parameter $\epsilon$
* Softmax with parameter $\tau$

In [6]:
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import copy
%matplotlib widget

class Action:
    """A single bandit arm together with the history of its value estimate.

    Rewards are drawn from a normal distribution with mean ``qStar`` and
    standard deviation ``qSigma``. ``qEst`` is a list that starts with the
    initial estimate and grows by one entry per call to ``updateQEst``.
    """

    def __init__(self, name, qStar=1, qSigma=1, qEst=0):
        self.name = name
        self.qStar = qStar
        self.qSigma = qSigma
        # History of estimates; index 0 holds the initial value.
        self.qEst = [qEst]

    def generateReward(self):
        """Draw one noisy reward for this arm."""
        noisyReward = np.random.normal(loc=self.qStar, scale=self.qSigma)
        return noisyReward

    def updateQEst(self, action, reward, alfa=0.01):
        """Append the estimate for the next round.

        ``action`` is the name of the arm that was played. When it is not
        this arm the previous estimate is carried over unchanged; otherwise
        the estimate moves towards ``reward`` with step size ``alfa``.
        """
        previous = self.qEst[-1]
        if action != self.name:
            self.qEst.append(previous)
            return
        self.qEst.append(previous + alfa*(reward - previous))

    def plotActionQEvolution(self):
        """Plot the estimate history of this arm on the current figure."""
        history = self.qEst
        plt.plot(history, label=self.name)
        plt.legend(loc="lower right")
        plt.xlabel("Rounds")
        plt.ylabel("Qai estimation")
        plt.title("Evolution of Qai estimation for actions: " + self.name)
  
class Selection:
    """Run one bandit experiment with a single action-selection method.

    The whole simulation runs inside ``__init__``: ``rounds`` times an arm is
    selected, a reward is drawn from it, and every arm's estimate history is
    extended by one entry.

    ``methodDescription`` is a dict whose ``"method"`` entry is a callable
    taking the list of ``Action`` objects; every other entry is forwarded to
    that callable as a keyword argument.
    """

    def __init__(self, name, actionDescriptions, methodDescription, rounds=1e4, alfa=1e-2):
        self.name = name
        self.actions = {actionName: Action(actionName, **description)
                        for actionName, description in actionDescriptions.items()}
        self.methodDescription = methodDescription
        # Index 0 is a placeholder (no reward collected yet), so this list
        # always has the same length as each action's qEst history.
        self.rewards = [0]
        for _ in range(int(rounds)):
            self.updateActions(alfa=alfa)

    def selectAction(self):
        """Return the name of the arm chosen by the configured method."""
        selectionMethod = self.methodDescription['method']
        methodArguments = {arg: value
                           for arg, value in self.methodDescription.items()
                           if arg != "method"}
        return selectionMethod(list(self.actions.values()), **methodArguments).name

    def updateActions(self, alfa=0.01):
        """Play one round: select an arm, record its reward, update all arms."""
        actionName = self.selectAction()
        reward = self.actions[actionName].generateReward()
        self.rewards.append(reward)
        for action in self.actions.values():
            action.updateQEst(actionName, reward, alfa=alfa)

    def getCumAverageRewards(self):
        """Return the running average reward after each round as a list.

        Entry 0 corresponds to the placeholder in ``self.rewards`` and is
        0.0; entry ``i`` (``i >= 1``) is the mean of the first ``i`` real
        rewards. The placeholder is not counted in the divisor, which would
        otherwise bias every average downwards.
        """
        averages = []
        runningTotal = 0
        for roundIndex, reward in enumerate(self.rewards):
            if roundIndex == 0:
                # Placeholder entry: no reward has been collected yet.
                averages.append(0.0)
                continue
            runningTotal += reward
            averages.append(runningTotal / roundIndex)
        return averages

    def plotCumAverageRewards(self):
        """Plot the running average reward on the current figure."""
        plt.plot(self.getCumAverageRewards(), label=self.name)
        plt.legend(loc="lower right")
        plt.xlabel("Rounds")
        plt.ylabel("Cumulated average of the rewards")
        plt.title("Evolution of the Cumulated average of the rewards over time:"+self.name)

    def plotActionQEvolution(self, actionName):
        """Plot the estimate history of one arm on the current figure."""
        self.actions[actionName].plotActionQEvolution()

    def plotActionQEvolutions(self):
        """Plot the estimate histories of all arms on the current figure."""
        for actionName in self.actions:
            self.plotActionQEvolution(actionName)
        # Each per-arm call sets its own title; set the shared one last.
        plt.title("Evolution of Qai estimation for different actions: "+self.name)
        
class SelectionReplicated:
    """Repeat the same ``Selection`` experiment several times.

    ``replicates`` independent ``Selection`` runs are created in
    ``__init__`` (with a tqdm progress bar); the plotting methods average
    their histories element-wise over the replicates.
    """

    def __init__(self, name, actionDescriptions, methodDescription, rounds=1e4, alfa=1e-2, replicates=2e1):
        self.name = name
        self.replicates = list()
        for _ in tqdm(range(int(replicates))):
            self.replicates.append(Selection(name, actionDescriptions, methodDescription, rounds=rounds, alfa=alfa))

    def plotCumAverageRewards(self):
        """Plot the running average reward, averaged over all replicates."""
        cumAverageRewards_averagedOverReplicates = np.mean(
            [selection.getCumAverageRewards() for selection in self.replicates], axis=0)
        plt.plot(cumAverageRewards_averagedOverReplicates, label=self.name)
        plt.legend(loc="lower right")
        plt.xlabel("Rounds")
        plt.ylabel("Cumulated average of the rewards")
        plt.title("Evolution of the Cumulated average of the rewards over time:"+self.name)

    def plotActionQEvolution(self, actionName, label=None):
        """Plot the replicate-averaged estimate history of one arm.

        ``label`` is the legend entry for the curve. It defaults to the name
        of this selection method, which is the right label when several
        methods are compared for the same arm.
        """
        qAverageOverReplicates = np.mean(
            [selection.actions[actionName].qEst for selection in self.replicates], axis=0)
        plt.plot(qAverageOverReplicates, label=self.name if label is None else label)
        plt.legend(loc="lower right")
        plt.xlabel("Rounds")
        plt.ylabel("Qai estimation")
        plt.title("Evolution of Qai estimation for different actions: " + self.name)

    def plotActionQEvolutions(self):
        """Plot the replicate-averaged estimate history of every arm."""
        for actionName in self.replicates[0].actions:
            # Label each curve with its arm; with the default label all
            # curves would share the method name and the legend is useless.
            self.plotActionQEvolution(actionName, label=actionName)
        plt.legend(loc="lower right")
        plt.xlabel("Rounds")
        plt.ylabel("Qai estimation")
        plt.title("Evolution of Qai estimation for different actions: " + self.name)

    def _countSelections(self, actionName):
        """Return, per replicate, how often the estimate of an arm changed.

        The estimate history only changes in rounds where the arm was
        played, so the number of changes is used as the number of pulls.
        A pull whose update leaves the estimate exactly unchanged is not
        visible in the history and therefore not counted.
        """
        return [int(np.count_nonzero(np.diff(rep.actions[actionName].qEst)))
                for rep in self.replicates]

    def plotHistogram(self):
        """Show and save a histogram of how often each arm was selected."""
        actions_n = list()
        labels = list()
        for actionName in self.replicates[0].actions:
            actions_n.append(self._countSelections(actionName))
            labels.append(actionName)
        plt.figure()
        plt.hist(actions_n, alpha=1, label=labels, rwidth=1)
        plt.xlabel("Times arm is selected")
        plt.ylabel("Count over different replications")
        plt.title("Histogram number of times arm is selected: "+ self.name)
        plt.legend(loc='upper right')
        plt.savefig("hist_"+self.name+".png", format="png", dpi=150)
        plt.show()
    
class SelectionComparison:
    """Run several selection methods on the same arms and plot them together.

    One ``SelectionReplicated`` is built per entry of ``methodDescriptions``
    and stored in ``selectionMethods`` under the same key.
    """

    def __init__(self, name, actionDescriptions, methodDescriptions, rounds=1e4, alfa=1e-2, replicates=2e1):
        self.name = name
        experiments = {}
        for methodName, description in methodDescriptions.items():
            experiments[methodName] = SelectionReplicated(
                methodName,
                actionDescriptions,
                description,
                rounds=rounds,
                alfa=alfa,
                replicates=replicates,
            )
        self.selectionMethods = experiments

    def plotCumAverageRewards(self):
        """Draw the running average reward of every method in one figure."""
        plt.figure()
        for experiment in self.selectionMethods.values():
            experiment.plotCumAverageRewards()
        plt.title("Comparison of the Cumulated average of the rewards over time.")
        plt.savefig("CumAverageReward.png", format="png", dpi=150)
        plt.show()

    def plotActionQEvolution(self, actionName):
        """Draw the estimate of one arm for every method in one figure."""
        plt.figure()
        # The true mean is read from the first replicate of the first method.
        firstExperiment = list(self.selectionMethods.values())[0]
        qStar = firstExperiment.replicates[0].actions[actionName].qStar

        plt.axhline(y=qStar, label="Qai*="+str(qStar), color="black", linestyle="--")
        for experiment in self.selectionMethods.values():
            experiment.plotActionQEvolution(actionName)
        plt.title("Comparison of Qa estimate over time: "+ actionName)
        plt.savefig("actionEvolution_"+actionName+".png", format="png", dpi=150)
        plt.show()

    def plotActionQEvolutions(self):
        """Draw one comparison figure per arm."""
        firstExperiment = list(self.selectionMethods.values())[0]
        for actionName in firstExperiment.replicates[0].actions:
            self.plotActionQEvolution(actionName)

    def plotHistogram(self):
        """Draw the selection-count histogram of every method."""
        for experiment in self.selectionMethods.values():
            experiment.plotHistogram()
    
    
def selectRandom(actions):
    """Return one element of ``actions`` chosen uniformly at random."""
    chosen = np.random.choice(actions)
    return chosen

def selectGreedy(actions, e=1):
    """Epsilon-greedy selection.

    With probability ``e`` a uniformly random action is returned; otherwise
    the action with the highest latest estimate. Because ``sorted`` is
    stable, ties go to the action that appears first in ``actions``.
    """
    explore = np.random.random() < e
    if explore:
        return selectRandom(actions)
    ranked = sorted(actions, key=lambda candidate: candidate.qEst[-1], reverse=True)
    return ranked[0]
    
def selectSoftmax(actions, tau=1e6):
    """Softmax (Boltzmann) selection with temperature ``tau``.

    Each action is chosen with probability proportional to
    ``exp(qEst[-1] / tau)``. The preferences are shifted by their maximum
    before exponentiating; this leaves the probabilities mathematically
    unchanged but avoids overflow to ``inf`` (and the resulting NaN
    probabilities) when an estimate is large compared with ``tau``.
    """
    preferences = np.array([action.qEst[-1] for action in actions], dtype=float) / tau
    weights = np.exp(preferences - np.max(preferences))
    p_actions = weights / np.sum(weights)
    return np.random.choice(actions, p=p_actions)



In [8]:
%matplotlib widget

# Initial value estimate shared by all arms.
qEst0 = 0

# Reward distribution of every arm: (name, mean, standard deviation).
actionsDescriptions = {
    armName: {"qStar": mean, "qSigma": std, "qEst": qEst0}
    for armName, mean, std in [
        ("Arm 1", 2.4, 0.9),
        ("Arm 2", 1.3, 2.0),
        ("Arm 3", 1.0, 0.4),
        ("Arm 4", 2.2, 0.6),
    ]
}

# Selection strategies to compare; every entry except "method" is passed to
# the strategy function as a keyword argument.
methodDescriptions = {
    "Random": dict(method=selectRandom),
    "Greedy e=0.0": dict(method=selectGreedy, e=0),
    "Greedy e=0.1": dict(method=selectGreedy, e=0.1),
    "Greedy e=0.2": dict(method=selectGreedy, e=0.2),
    "SoftMax tau=1.0": dict(method=selectSoftmax, tau=1),
    "SoftMax tau=0.1": dict(method=selectSoftmax, tau=0.1),
}

# Run all strategies (100 replicates of 1000 rounds each) and plot them.
selections = SelectionComparison(
    "Comparison",
    actionsDescriptions,
    methodDescriptions,
    rounds=1e3,
    alfa=1e-1,
    replicates=1e2,
)
selections.plotCumAverageRewards()
selections.plotActionQEvolutions()
selections.plotHistogram()


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [10]:
%matplotlib widget

# Initial value estimate shared by all arms.
qEst0 = 0
# Factor applied to every arm's standard deviation.
m = 2

# Reward distribution of every arm: (name, mean, base standard deviation).
actionsDescriptions = {
    armName: {"qStar": mean, "qSigma": std*m, "qEst": qEst0}
    for armName, mean, std in [
        ("Arm 1", 2.4, 0.9),
        ("Arm 2", 1.3, 2.0),
        ("Arm 3", 1.0, 0.4),
        ("Arm 4", 2.2, 0.6),
    ]
}

# Selection strategies to compare; every entry except "method" is passed to
# the strategy function as a keyword argument.
methodDescriptions = {
    "Random": dict(method=selectRandom),
    "Greedy e=0.0": dict(method=selectGreedy, e=0),
    "Greedy e=0.1": dict(method=selectGreedy, e=0.1),
    "Greedy e=0.2": dict(method=selectGreedy, e=0.2),
    "SoftMax tau=1.0": dict(method=selectSoftmax, tau=1),
    "SoftMax tau=0.1": dict(method=selectSoftmax, tau=0.1),
}

# Run all strategies (100 replicates of 1000 rounds each) and plot them.
selections = SelectionComparison(
    "Comparison",
    actionsDescriptions,
    methodDescriptions,
    rounds=1e3,
    alfa=1e-1,
    replicates=1e2,
)
selections.plotCumAverageRewards()
selections.plotActionQEvolutions()
selections.plotHistogram()

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [11]:
%matplotlib widget

def selectGreedyDecreasing(actions, power=1.0/2.0):
    """Epsilon-greedy selection whose exploration rate decays over time.

    The exploration probability is ``1 / t**power``, where ``t`` is the
    length of the first action's estimate history. With that probability a
    uniformly random action is returned; otherwise the action with the
    highest latest estimate (ties go to the earliest action in ``actions``).
    """
    t = len(actions[0].qEst)
    e = 1/(t**power)
    explore = np.random.random() < e
    if explore:
        return selectRandom(actions)
    ranked = sorted(actions, key=lambda candidate: candidate.qEst[-1], reverse=True)
    return ranked[0]

# Initial value estimate shared by all arms.
qEst0 = 0
# Factor applied to every arm's standard deviation.
m = 1

# Reward distribution of every arm: (name, mean, base standard deviation).
actionsDescriptions = {
    armName: {"qStar": mean, "qSigma": std*m, "qEst": qEst0}
    for armName, mean, std in [
        ("Arm 1", 2.4, 0.9),
        ("Arm 2", 1.3, 2.0),
        ("Arm 3", 1.0, 0.4),
        ("Arm 4", 2.2, 0.6),
    ]
}

# Decaying epsilon-greedy against the fixed-epsilon variants; every entry
# except "method" is passed to the strategy function as a keyword argument.
methodDescriptions = {
    "Greedy decreasing": dict(method=selectGreedyDecreasing, power=(1.0/2.0)),
    "Greedy e=0.0": dict(method=selectGreedy, e=0),
    "Greedy e=0.1": dict(method=selectGreedy, e=0.1),
    "Greedy e=0.2": dict(method=selectGreedy, e=0.2),
}

# Run all strategies (100 replicates of 1000 rounds each) and plot them.
selections = SelectionComparison(
    "Comparison",
    actionsDescriptions,
    methodDescriptions,
    rounds=1e3,
    alfa=1e-1,
    replicates=1e2,
)
selections.plotCumAverageRewards()
selections.plotActionQEvolutions()
selections.plotHistogram()

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
%matplotlib widget

def selectSoftmaxDecreasing(actions, maxT=1e3, maxTau=4.0, minTau=1e-2):
    """Softmax selection whose temperature decreases linearly over time.

    ``t`` is the length of the first action's estimate history. The
    temperature is ``minTau + maxTau * (maxT - t) / maxT`` and is held at
    ``minTau`` once ``t`` exceeds ``maxT``; without that clamp it would turn
    negative and the selection would start favouring the worst action.

    The temperature is computed once per call, and the preferences are
    shifted by their maximum before exponentiating so that small
    temperatures cannot overflow ``np.exp``.
    NOTE(review): ``minTau`` is presumably expected to be positive; with
    ``minTau=0`` the temperature reaches zero at ``t >= maxT``.
    """
    t = len(actions[0].qEst)
    remaining = max(maxT - t, 0)
    tau = minTau + maxTau*remaining/maxT
    preferences = np.array([action.qEst[-1] for action in actions], dtype=float) / tau
    weights = np.exp(preferences - np.max(preferences))
    p_actions = weights / np.sum(weights)
    return np.random.choice(actions, p=p_actions)

# Number of rounds per run; also the horizon over which tau is decreased.
rounds = 1e3
# Initial value estimate shared by all arms.
qEst0 = 0
# Factor applied to every arm's standard deviation.
m = 1

# Reward distribution of every arm: (name, mean, base standard deviation).
actionsDescriptions = {
    armName: {"qStar": mean, "qSigma": std*m, "qEst": qEst0}
    for armName, mean, std in [
        ("Arm 1", 2.4, 0.9),
        ("Arm 2", 1.3, 2.0),
        ("Arm 3", 1.0, 0.4),
        ("Arm 4", 2.2, 0.6),
    ]
}

# Decreasing-temperature softmax against the fixed-temperature variants;
# every entry except "method" is passed to the strategy as a keyword argument.
methodDescriptions = {
    "SoftMax Decreasing": dict(method=selectSoftmaxDecreasing, maxT=rounds, maxTau=4.0),
    "SoftMax tau=1.0": dict(method=selectSoftmax, tau=1),
    "SoftMax tau=0.1": dict(method=selectSoftmax, tau=0.1),
}

# Run all strategies (100 replicates each) and plot them.
selections = SelectionComparison(
    "Comparison",
    actionsDescriptions,
    methodDescriptions,
    rounds=rounds,
    alfa=1e-1,
    replicates=1e2,
)
selections.plotCumAverageRewards()
selections.plotActionQEvolutions()
selections.plotHistogram()

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 2. Stochastic Reward Game

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
%matplotlib widget

class Game:
    """A repeated cooperative game played by learning agents.

    ``gameRewards`` maps a joint action (a tuple with one action name per
    agent) to a dict containing a ``"rewardFunction"`` callable; the other
    entries of that dict are passed to the callable as keyword arguments.
    The whole game (``rounds`` iterations of ``main``) runs in ``__init__``.

    Joint actions are looked up after sorting the action names, so the keys
    of ``gameRewards`` must themselves be in sorted order.
    """

    def __init__(self, gameRewards, rounds=5e3, initialValue=0):
        self.gameRewards = gameRewards
        self.agents = self.generateAgents(initialValue=[initialValue])
        # Index 0 is a placeholder: no reward has been collected yet.
        self.rewards = [0]
        for _ in range(int(rounds)):
            self.main()

    def getReward(self, actionA, actionB):
        """Sample the reward of the joint action formed by the two actions."""
        jointAction = tuple(sorted([actionA, actionB]))
        state = self.gameRewards[jointAction]
        rewardFunction = state["rewardFunction"]
        params = {key: value for key, value in state.items() if key != "rewardFunction"}
        return rewardFunction(**params)

    def generateAgents(self, initialValue=None):
        """Create one agent per position of the joint-action tuples.

        ``initialValue`` is the initial belief history (a list) given to
        each agent for every joint action; it defaults to ``[0]``. Every
        agent receives its own copies, so the argument is never mutated.
        """
        if initialValue is None:
            initialValue = [0]
        nAgents = len(list(self.gameRewards)[0])
        agents = list()
        for i in range(nAgents):
            rewardBelieves = {key: initialValue[:] for key in self.gameRewards}
            # The i-th agent's actions are the distinct i-th key components.
            actions = sorted(np.unique([key[i] for key in self.gameRewards]))
            agents.append(Agent(actions, rewardBelieves, self))
        for agent in agents:
            agent.generateInitialActionBelieves(
                [other for other in agents if other is not agent])
        return agents

    def updateActionBelieves(self, agentActions):
        """Let every agent record the actions just played."""
        for agent in self.agents:
            agent.updateActionBelieves(agentActions)

    def updateRewardBelieves(self, agentActions, reward):
        """Let every agent update its reward beliefs with the new reward."""
        for agent in self.agents:
            agent.updateRewardBelieves(agentActions, reward)

    def getCumAverageRewards(self):
        """Return the running average reward after each round as a list.

        Entry 0 corresponds to the placeholder in ``self.rewards`` and is
        0.0; entry ``i`` (``i >= 1``) is the mean of the first ``i`` real
        rewards. The placeholder is not counted in the divisor, which would
        otherwise bias every average downwards.
        """
        averages = []
        runningTotal = 0
        for roundIndex, reward in enumerate(self.rewards):
            if roundIndex == 0:
                # Placeholder entry: no reward has been collected yet.
                averages.append(0.0)
                continue
            runningTotal += reward
            averages.append(runningTotal / roundIndex)
        return averages

    def main(self):
        """Play one round: choose actions, sample the reward, update beliefs."""
        agentActions = {agent: agent.chooseAction() for agent in self.agents}
        self.updateActionBelieves(agentActions)
        jointAction = tuple(sorted(agentActions.values()))
        reward = self.getReward(*jointAction)
        self.rewards.append(reward)
        self.updateRewardBelieves(agentActions, reward)
        
    
class Agent:
    """A player that learns joint-action rewards and models its opponent.

    ``rewardBelieves`` maps each joint action to the history of the agent's
    reward estimate for it. ``actionBelieves`` (created by
    ``generateInitialActionBelieves``) maps every other agent to a
    fixed-length window of that agent's most recent actions, newest first.
    """

    def __init__(self, actions, initialBelievesReward, game):
        self.actions = actions
        self.rewardBelieves = initialBelievesReward
        self.game = game

    def generateInitialActionBelieves(self, otherAgents):
        """Fill the action window of every other agent with 20 random actions."""
        self.actionBelieves = dict()
        for agent in otherAgents:
            self.actionBelieves[agent] = list(np.random.choice(agent.actions, size=int(2e1)))

    def updateActionBelieves(self, agentActions):
        """Push the other agents' latest actions into their windows.

        The newest action is inserted at the front and the oldest one is
        dropped, so the window length stays constant.
        """
        for agent in list(agentActions):
            if agent != self:
                self.actionBelieves[agent] = [agentActions[agent]] + self.actionBelieves[agent][:-1]

    def updateRewardBelieves(self, agentActions, reward, alfa=0.1):
        """Extend every reward-belief history by one entry.

        The belief of the joint action that was played moves towards
        ``reward`` with step size ``alfa``; all others are carried over.
        """
        jointAction = tuple(sorted(agentActions.values()))
        for key in list(self.rewardBelieves):
            lastBelieve = self.rewardBelieves[key][-1]
            if jointAction == key:
                newBelieve = lastBelieve + alfa*(reward - lastBelieve)
                self.rewardBelieves[key].append(newBelieve)
            else:
                self.rewardBelieves[key].append(lastBelieve)

    def otherAgents(self):
        """Return the agents of this agent's own game, excluding itself."""
        # Use self.game rather than a module-level ``game`` variable, which
        # may not exist or may refer to a different Game instance.
        return [agent for agent in self.game.agents if agent is not self]

    def actionBelieves2prob(self):
        """Return ``(action, probability)`` pairs for the first other agent.

        Probabilities are the relative frequencies of the actions in the
        stored window. Only the first entry of ``actionBelieves`` is used,
        so this only works for games with exactly two agents.
        """
        for agent in self.actionBelieves.keys():
            # Append every possible action once so that actions missing from
            # the window still appear in np.unique; the extra count is
            # subtracted again below.
            actionss = self.actionBelieves[agent] + agent.actions
            actions, counts = np.unique(actionss, return_counts=True)
            probs = (counts-1) / float(len(self.actionBelieves[agent]))
            return list(zip(actions, probs))

    def predictRewards(self):
        """Return the expected reward of each own action and the action list.

        The expectation is taken over the other agent's estimated action
        distribution: the sum of belief times probability. (A plain mean of
        those products would scale every value by one over the number of
        opponent actions.)
        NOTE(review): relies on the sorted joint-action keys lining up with
        the sorted opponent actions returned by ``actionBelieves2prob``.
        """
        actionProbs = self.actionBelieves2prob()
        probs = np.array([value[1] for value in actionProbs])
        r = list()
        actions = list()
        for action in self.actions:
            combinations = sorted([key for key in self.rewardBelieves.keys() if action in key])
            rewards = np.array([self.rewardBelieves[comb][-1] for comb in combinations])
            r.append(np.dot(rewards, probs))
            actions.append(action)
        return r, actions

    def strategyBoltzmann(self, tau=1):
        """Return Boltzmann selection probabilities over the own actions.

        The scaled expected rewards are shifted by their maximum before
        exponentiating; this does not change the probabilities but prevents
        overflow for large rewards or small ``tau``.
        """
        rewards, actions = self.predictRewards()
        preferences = np.array(rewards, dtype=float) / tau
        terms = np.exp(preferences - np.max(preferences))
        p_actions = terms / np.sum(terms)
        return p_actions

    def chooseAction(self):
        """Sample one own action from the Boltzmann distribution."""
        return np.random.choice(self.actions, p=self.strategyBoltzmann())

In [15]:
%matplotlib widget

import os

# Number of independent games averaged for every noise configuration.
rounds = 50

# One entry per experiment: (sigma0, sigma1, sigma, output figure path).
# sigma0 is the noise of joint action (a1, b1), sigma1 that of (a2, b2) and
# sigma that of every other joint action.
experimentSettings = [
    (0.2, 0.2, 0.2, "ex2/1.png"),
    (4, 0.1, 0.1, "ex2/2.png"),
    (0.1, 4, 0.1, "ex2/3.png"),
]

for sigma0, sigma1, sigma, figurePath in experimentSettings:
    gameRewards = {
        ('a1','b1'):{"rewardFunction":np.random.normal, "scale":sigma0, "loc":11},
        ('a1','b2'):{"rewardFunction":np.random.normal, "scale":sigma, "loc":-30},
        ('a1','b3'):{"rewardFunction":np.random.normal, "scale":sigma, "loc":0},
        ('a2','b1'):{"rewardFunction":np.random.normal, "scale":sigma, "loc":-30},
        ('a2','b2'):{"rewardFunction":np.random.normal, "scale":sigma1, "loc":7},
        ('a2','b3'):{"rewardFunction":np.random.normal, "scale":sigma, "loc":6},
        ('a3','b1'):{"rewardFunction":np.random.normal, "scale":sigma, "loc":0},
        ('a3','b2'):{"rewardFunction":np.random.normal, "scale":sigma, "loc":0},
        ('a3','b3'):{"rewardFunction":np.random.normal, "scale":sigma, "loc":5},
    }

    Rewards = list()
    OptimisticRewards = list()

    # Play each game twice per repetition: once with neutral initial
    # beliefs (0) and once with optimistic initial beliefs (50).
    for _ in range(rounds):
        game = Game(gameRewards, initialValue=0)
        Rewards.append(game.getCumAverageRewards())
        gameOptimistic = Game(gameRewards, initialValue=50)
        OptimisticRewards.append(gameOptimistic.getCumAverageRewards())

    plt.figure()
    plt.plot(np.average(Rewards,axis=0), label = "Boltzmann (tau=1)")
    plt.plot(np.average(OptimisticRewards,axis=0), label = "Optimistic Boltzmann (tau=1)")
    plt.xlabel("Episode")
    plt.ylabel("Average collected reward per episode")
    plt.legend(loc="lower right")
    plt.title("sigma0="+str(sigma0)+" sigma1="+str(sigma1)+" sigma="+str(sigma))
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(os.path.dirname(figurePath), exist_ok=True)
    plt.savefig(figurePath, format="png", dpi=150)
    plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …