2 BERNOULLI BANDITS FIXED POLICY

WIN STAY LOSE SHIFT (WSLS) benchmark problem.

Controller size=2

In [2]:
import numpy as np

# Problem dimensions: states, actions, controller (memory) nodes, observations.
dimS = 2
dimA = 2
dimQ = 2
dimO = 2
# Bernoulli parameter: probability of observing o=1 when the action index
# equals the state index (see obs_prob), and the matching average reward.
p = 0.8


states = np.arange(0, dimS, dtype=int)
actions = np.arange(0, dimA, dtype=int)
nodes = np.arange(0, dimQ, dtype=int)
obs = np.arange(0, dimO, dtype=int)


# p(s'|s,a), indexed as env_prob[s', s, a]: the state never changes, whatever
# the action. The array is allocated in the same axis order it is indexed
# with, so the construction stays valid when dimA != dimS.
env_prob = np.zeros((dimS, dimS, dimA))
for action in actions:
    env_prob[:, :, action] = np.eye(dimS)


# p(a|q), indexed as policy[a, q]. This is the policy of the WSLS strategy:
# node q always plays the action with the same index.
policy = np.eye(dimA, dimQ)

# f(o|s',a), indexed as obs_prob[o, s', a]: Bernoulli observation whose
# success probability is p when a == s' and 1-p otherwise.
obs_prob = np.zeros((dimO, dimS, dimA))
for stateprime in states:
    for action in actions:
        win_prob = p if stateprime == action else 1.0 - p
        for o in obs:
            obs_prob[o, stateprime, action] = win_prob**o * (1.0 - win_prob)**(1 - o)

# g(q'|q,a,o), indexed as update[q', a, q, o]. This is the memory update of
# the WSLS strategy; it does not depend on the action.
update = np.zeros((dimQ, dimA, dimQ, dimO))
for nodeprime in nodes:
    for node in nodes:
        for o in obs:
            win_stay = o == 1 and node == nodeprime
            lose_shift = o == 0 and node != nodeprime
            if win_stay or lose_shift:
                update[nodeprime, :, node, o] = 1

# r(s,a), indexed as rew[s, a]: average reward, p when the action index
# equals the state index and 1-p otherwise.
rew = np.zeros((dimS, dimA))

rew[0, 0] = p
rew[0, 1] = 1 - p
rew[1, 0] = 1 - p
rew[1, 1] = p


Since this is a benchmark problem, the policy and the memory update are fixed. The quadratic Bellman constraint therefore reduces to a linear system, whose solution is the value function.

In [3]:
# Evaluate the fixed controller: build the linear Bellman system
#   sum_{q',s'} [delta(qq',ss') - gamma*T(q,s,q',s')] v(q',s') = b(q,s)
# and solve it for the value v(q,s).
init_node = 0
gamma = 0.5

# b(q,s) = sum_a r(s,a)*policy(a|q). rew @ policy is indexed [s, q], so it is
# transposed to [q, s] to match the leading axes of T and the
# v[init_node, :] lookup below.
b = np.matmul(rew, policy).T

# T(q,s,q',s') = sum_a sum_o p(s'|s,a)*f(o|s',a)*g(q'|q,a,o)*policy(a|q)
# einsum letters: p=q', a=action, q=node, o=observation, t=s', s=state.
T = np.einsum('paqo,aq,tsa,ota->qspt', update, policy, env_prob, obs_prob)

# Id(q,s,q',s') = delta(qq')*delta(ss'): identity on the flattened (q,s) index.
Id = np.eye(dimQ*dimS).reshape(dimQ, dimS, dimQ, dimS)

# delta(ss',qq') - gamma*T(q,s,q',s')
Btens = Id - gamma*T

# Solve the linear system to get the unique value solution v(q,s)
v = np.linalg.tensorsolve(Btens, b)
print("Value:", v)
print("=============")

# Objective function with fixed initial node: the values of init_node are
# summed over the states with weight 0.5 each.
print("Objective function:", 0.5*np.sum(v[init_node, :]))

# Joint variable x(q',a,q,o) = g(q'|q,a,o)*policy(a|q)
x = update*policy[np.newaxis, :, :, np.newaxis]

print(x[1, 1, 1, 0], x[1, 1, 1, 1])



Value: [[1.48 0.88]
 [0.88 1.48]]
Objective function: 1.18
0.0 1.0


In the following section the same calculations are carried out. In this case rank-two arrays are used to evaluate the Bellman equation. 


The "Bellman matrix" (Bmat) is block diagonal, which makes sense because the transition probability from one state to the other is zero.

In [18]:
from f2py_jit import jit
import numpy as np
# Build the Fortran module with f2py_jit; its qclp.drive routine is called below.
f90 = jit("SA_QCLP_ut_02_cor1.f90", flags="-O2 -fcheck=bounds")

# Starting point of the search: the fixed-controller solution.
# initial value for the x(q'aqo)
x_init = x
# initial value for the y(qs)
y_init = v
# Initial value of objective function
val_init = 0.5*np.sum(y_init[init_node, :])

print("Initial mean value   >>>>>>>>>>")
print(val_init)
print("Initial policy       >>>>>>>>>>")
print(policy)

# Annealing parameters
T0 = 50
T_end = 1e-6
Temp_factor = 0.6
nstep = 1000
pen_step = 200
N = 1
stdev = 1.0

# Passed unchanged to the Fortran driver on every run.
sigma = 2.0
mu = 2.0

# 1 + log(T_end/T0)/log(Temp_factor): temperature levels visited when the
# temperature goes from T0 to T_end by repeated multiplication by Temp_factor.
n_levels = 1 + np.log(T_end/T0)/np.log(Temp_factor)
# Number of Gaussian draws reserved for x(q'aqo) and y(qs) respectively.
size_gauss_x = np.round(dimO*nstep*n_levels).astype(int)
size_gauss_y = np.round(dimS*nstep*n_levels).astype(int)

# The loop variable must not be called `p`: that name holds the Bernoulli
# parameter of the model and would be silently overwritten.
for run in range(N):

    # Gaussian sample for x: there is no observation axis, so the generated
    # point is independent of o.
    x_gauss = np.random.normal(0.0, stdev, size=(dimQ, dimA, dimQ, pen_step*size_gauss_x))

    # Gaussian sample for y(qs)
    y_gauss = np.random.normal(0.0, stdev, size=(dimQ, pen_step*size_gauss_y))

    # Simulated Annealing
    Res = f90.qclp.drive(T0, T_end, Temp_factor, nstep, pen_step, x_init, y_init, val_init, rew, env_prob, obs_prob, sigma, mu, x_gauss, y_gauss, gamma)

# Result of the last run: Res[0] is used as x, Res[1] as y, and Res[2] is
# printed as the objective function.
x_min = Res[0]
y_min = Res[1]
print("value                >>>>>>>>>>>")
print(y_min)
print("mean value           >>>>>>>>>>>")
print(0.5*np.sum(y_min[init_node, :]))
print("objective function   >>>>>>>>>>>")
print(Res[2])


Initial mean value   >>>>>>>>>>
1.18
Initial policy       >>>>>>>>>>
[[1. 0.]
 [0. 1.]]
value                >>>>>>>>>>>
[[1.625197   0.99755214]
 [0.93697533 1.42047704]]
mean value           >>>>>>>>>>>
1.3113745684523215
objective function   >>>>>>>>>>>
-1.1824918870964816


In [20]:
# Inspect the annealing result x_min, indexed as x_min[q', a, q, o].
print(y_min)

# Action marginals for each observation value: summing x over the next node
# q' leaves an array indexed [a, q]. One axis reduction replaces the nested
# loops, which recomputed the same slice sum on every pass of an unused
# inner loop.
Pol0 = np.sum(x_min[:, :, :, 0], axis=0)
Pol1 = np.sum(x_min[:, :, :, 1], axis=0)


print(Pol0)
print(Pol1)

# Total of x over (q', a) for every (q, o), indexed [q, o]; printed as a
# normalization check.
normalization = np.sum(x_min, axis=(0, 1))

print("norm", normalization)

[[1.625197   0.99755214]
 [0.93697533 1.42047704]]
[[0.90703248 0.07559685]
 [0.09296752 0.92440315]]
[[0.91422573 0.07269337]
 [0.08577427 0.92730663]]
norm [[1. 1.]
 [1. 1.]]


In [2]:
import numpy as np

# Problem dimensions: states, actions, controller (memory) nodes, observations.
# Node 2 is the extra "intermediate" node of this three-node controller.
dimS = 2
dimA = 2
dimQ = 3
dimO = 2
# Bernoulli parameter: probability of observing o=1 when the action index
# equals the state index (see obs_prob), and the matching average reward.
p = 0.8


states = np.arange(0, dimS, dtype=int)
actions = np.arange(0, dimA, dtype=int)
nodes = np.arange(0, dimQ, dtype=int)
obs = np.arange(0, dimO, dtype=int)


# p(s'|s,a), indexed as env_prob[s', s, a]: the state never changes, whatever
# the action. The array is allocated in the same axis order it is indexed
# with, so the construction stays valid when dimA != dimS.
env_prob = np.zeros((dimS, dimS, dimA))
for action in actions:
    env_prob[:, :, action] = np.eye(dimS)


# p(a|q), indexed as policy[a, q]: nodes 0 and 1 always play the action with
# the same index, node 2 plays each action with probability 0.5.
policy = np.zeros((dimA, dimQ))

for action in actions:
    for node in nodes:
        if action == node:
            policy[action, node] = 1
        elif node == 2:
            policy[action, node] = 0.5

# f(o|s',a), indexed as obs_prob[o, s', a]: Bernoulli observation whose
# success probability is p when a == s' and 1-p otherwise.
obs_prob = np.zeros((dimO, dimS, dimA))
for stateprime in states:
    for action in actions:
        win_prob = p if stateprime == action else 1.0 - p
        for o in obs:
            obs_prob[o, stateprime, action] = win_prob**o * (1.0 - win_prob)**(1 - o)

# g(q'|q,a,o), indexed as update[q', a, q, o]: memory update of the strategy.
# Entries not set below stay at zero.
update = np.zeros((dimQ, dimA, dimQ, dimO))

for nodeprime in nodes:
    for action in actions:
        for node in nodes:
            for o in obs:
                # if we win in the node corresponding to action, we stay
                if nodeprime == node and node == action and o == 1:
                    update[nodeprime, action, node, o] = 1
                # if we lose in the node corresponding to action, we shift to the intermediate node
                elif nodeprime == 2 and node == action and o == 0:
                    update[nodeprime, action, node, o] = 1
                # if we win in the intermediate node, we shift to the node of the action taken
                elif nodeprime == action and node == 2 and o == 1:
                    update[nodeprime, action, node, o] = 1  # P(a | 2) = 0.5
                # if we lose in the intermediate node, we shift to the node of the other action
                elif nodeprime != action and node == 2 and nodeprime != 2 and o == 0:
                    update[nodeprime, action, node, o] = 1  # P(a | 2) = 0.5

# r(s,a), indexed as rew[s, a]: average reward, p when the action index
# equals the state index and 1-p otherwise.
rew = np.zeros((dimS, dimA))

rew[0, 0] = p
rew[0, 1] = 1 - p
rew[1, 0] = 1 - p
rew[1, 1] = p


In [3]:
# Evaluate the fixed three-node controller: build the linear Bellman system
#   sum_{q',s'} [delta(qq',ss') - gamma*T(q,s,q',s')] v(q',s') = b(q,s)
# and solve it for the value v(q,s).
init_node = 0
gamma = 0.5

# b(q,s) = sum_a r(s,a)*policy(a|q), indexed [q, s]
b = np.zeros((dimQ, dimS))
for q, s in np.ndindex(dimQ, dimS):
    for a in actions:
        b[q, s] += rew[s, a]*policy[a, q]


# Bellman equation linear constraint
# T(q,s,q',s') = sum_a sum_o p(s'|s,a)*f(o|s',a)*g(q'|q,a,o)*policy(a|q)
T = np.zeros((dimQ, dimS, dimQ, dimS))
for q, s, q_next, s_next in np.ndindex(dimQ, dimS, dimQ, dimS):
    for o in obs:
        for a in actions:
            T[q, s, q_next, s_next] += (
                update[q_next, a, q, o]*policy[a, q]
                * env_prob[s_next, s, a]*obs_prob[o, s_next, a]
            )

# Id(q,s,q',s') = delta(qq')*delta(ss'): identity on the flattened (q,s) index
Id = np.eye(dimQ*dimS).reshape(dimQ, dimS, dimQ, dimS)

# delta(ss',qq') - gamma*T(q,s,q',s')
Btens = Id - gamma*T

# Solve the linear system to get the unique value solution v(q,s)
v = np.linalg.tensorsolve(Btens, b)
print("Value:", v)
print("=============")

# Objective function with fixed initial node
print("Objective function:", 0.5*np.sum(v[init_node, :]))

# Joint variable x(q',a,q,o) = g(q'|q,a,o)*policy(a|q)
x = update*policy[np.newaxis, :, :, np.newaxis]

Value: [[1.53125 0.75   ]
 [0.75    1.53125]
 [1.1875  1.1875 ]]
Objective function: 1.140625


In [5]:
import numpy as np
from f2py_jit import jit
# Build the Fortran module with f2py_jit; its qclp.drive routine is called below.
f90 = jit("SA_QCLP_ut_02_cor1.f90", flags="-O3 -fcheck=bounds")

# Starting point of the search: the fixed-controller solution.
# initial value for the x(q'aqo)
x_init = x
# initial value for the y(qs)
y_init = v
# Initial value of objective function
val_init = 0.5*np.sum(y_init[init_node, :])

print("Initial mean value   >>>>>>>>>>")
print(val_init)
print("Initial policy       >>>>>>>>>>")
print(policy)

# Annealing parameters
T0 = 10
T_end = 1e-6
Temp_factor = 0.6
factor = 0.3  # not used anywhere in this cell
nstep = 500
pen_step = 1000
N = 1
stdev = 1.0

# Passed unchanged to the Fortran driver on every run.
sigma = 2.0
mu = 2.0

# 1 + log(T_end/T0)/log(Temp_factor): temperature levels visited when the
# temperature goes from T0 to T_end by repeated multiplication by Temp_factor.
n_levels = 1 + np.log(T_end/T0)/np.log(Temp_factor)
# Number of Gaussian draws reserved for x(q'aqo) and y(qs) respectively.
size_gauss_x = np.round(dimO*nstep*n_levels).astype(int)
size_gauss_y = np.round(dimS*nstep*n_levels).astype(int)

# The loop variable must not be called `p`: that name holds the Bernoulli
# parameter of the model and would be silently overwritten.
for run in range(N):

    # Gaussian sample for x: there is no observation axis, so the generated
    # point is independent of o.
    # NOTE: this array holds dimQ*dimA*dimQ*pen_step*size_gauss_x doubles.
    x_gauss = np.random.normal(0.0, stdev, size=(dimQ, dimA, dimQ, pen_step*size_gauss_x))

    # Gaussian sample for y(qs)
    y_gauss = np.random.normal(0.0, stdev, size=(dimQ, pen_step*size_gauss_y))

    # Simulated Annealing
    Res = f90.qclp.drive(T0, T_end, Temp_factor, nstep, pen_step, x_init, y_init, val_init, rew, env_prob, obs_prob, sigma, mu, x_gauss, y_gauss, gamma)


# Result of the last run: Res[0] is used as x, Res[1] as y, and Res[2] is
# printed as the objective function.
x_min = Res[0]
y_min = Res[1]
print("value                >>>>>>>>>>>")
print(y_min)
print("mean value           >>>>>>>>>>>")
print(0.5*np.sum(y_min[init_node, :]))
print("objective function   >>>>>>>>>>>")
print(Res[2])


Initial mean value   >>>>>>>>>>
1.140625
Initial policy       >>>>>>>>>>
[[1.  0.  0.5]
 [0.  1.  0.5]]
value                >>>>>>>>>>>
[[1.45055557 0.90821086]
 [0.6444188  1.55060089]
 [0.71110304 1.53534958]]
mean value           >>>>>>>>>>>
1.1793832137219378
objective function   >>>>>>>>>>>
-1.1782832562209606


In [29]:
# Inspect the annealing result x_min, indexed as x_min[q', a, q, o].
print(y_min)

# Action marginals for each observation value: summing x over the next node
# q' leaves an array indexed [a, q]. One axis reduction replaces the nested
# loops, which recomputed the same slice sum on every pass of an unused
# inner loop.
Pol0 = np.sum(x_min[:, :, :, 0], axis=0)
Pol1 = np.sum(x_min[:, :, :, 1], axis=0)


print(Pol0)
print(Pol1)

# Total of x over (q', a) for every (q, o), indexed [q, o]; printed as a
# normalization check.
normalization = np.sum(x_min, axis=(0, 1))

print("norm", normalization)

[[1.46144802 0.90041418]
 [0.70939496 1.53451892]
 [0.80898385 1.51079012]]
[[0.99871362 0.00452143 0.00213783]
 [0.00128638 0.99547857 0.99786217]]
[[0.99874143 0.00446269 0.00213568]
 [0.00125857 0.99553731 0.99786432]]
norm [[1. 1.]
 [1. 1.]
 [1. 1.]]
