2 BERNOULLI BANDITS FIXED POLICY

WIN STAY LOOSE SHIFT (WSLS) benchmark problem.

Controller size=2

In [18]:
import numpy as np

dimS=2
dimA=2
dimQ=2
dimO=2
p=0.8


states=np.arange(0,dimS,dtype=int)
actions=np.arange(0,dimA,dtype=int)
nodes=np.arange(0,dimQ,dtype=int)
obs=np.arange(0,dimO,dtype=int)


# p(s'|sa)
env_prob=np.zeros((dimS,dimA,dimS))

for stateprime in states:
    for action in actions:
        for state in states:
            if (stateprime == state): env_prob[stateprime,state,action]=1


# p(a|q) Here's the policty of the WSLS strategy
policy=np.zeros((dimA,dimQ))

for action in actions:
    for node in nodes:
        if (action == node): policy[action,node]=1

#f(o|s'a)
obs_prob=np.zeros((dimO,dimS,dimA))

for stateprime in states:
    for o in obs:
        for action in actions:
            if (stateprime==action): 
                obs_prob[o,stateprime,action]=p**o*(1.0-p)**(1-o)
            else:
                obs_prob[o,stateprime,action]=(1.0-p)**o*p**(1.0-o)

#g(q'|qao) Here's the memory update of the WSLS strategy
update=np.zeros((dimQ,dimA,dimQ,dimO))

for nodeprime in nodes:
    for node in nodes:
        for action in actions:
            for o in obs:
                if (o==1 and node==nodeprime): update[nodeprime,action,node,o]=1
                if (o==0 and node!=nodeprime): update[nodeprime,action,node,o]=1
                
#r(sa) Average reward
rew=np.zeros((dimS,dimA))

rew[0,0]=p
rew[0,1]=1-p
rew[1,0]=1-p
rew[1,1]=p


Since this is a benchmark problem the policy and the memory update have been fixed. The Bellman quaratic constraint boils down to be a linear system. The solution of this system is the value. 

In [19]:
#b(sq)
init_node=0
gamma=0.5
b=np.matmul(rew,policy)

#Bellman equation linear constraint
T=np.zeros((dimQ,dimS,dimQ,dimS))
Id=np.zeros((dimQ,dimS,dimQ,dimS))

#T(sq|s'q')=sum_a sum_y (p(s'|sa)*f(o|sa)*g(q'|qao)*policy(a|q))
for node in nodes:
    for state in states:
        for nodeprime in nodes:
            for stateprime in states:
                for o in obs:
                    for action in actions:
                        T[node,state,nodeprime,stateprime]+=update[nodeprime,action,node,o]*policy[action,node]*env_prob[stateprime,state,action]*obs_prob[o,stateprime,action]

#Id=delta(ss',qq')
for nodeprime in nodes:
    for stateprime in states:
        for node in nodes:
            for state in states:
                if (state==stateprime and node==nodeprime): Id[node,state,nodeprime,stateprime]=1

#delta(ss',qq')- gamma*[sum_a sum_y (p(s'|sa)*f(o|sa)*g(q'|qao)*policy(a|q))]
Btens=Id-gamma*T

#Solve the linear system to get the single value solution
v=np.linalg.tensorsolve(Btens,b)
print("Value:",v)
print("=============")

#Calculation of the objective function with fixed inotial node
print("Objective function:", 0.5*np.sum(v[init_node,:]))

#Def normalized x(q',a,q,o)
x=np.zeros((dimQ,dimA,dimQ,dimO))
for nodeprime in nodes:
    for action in actions:
        for node in nodes:
            for o in obs:
                x[nodeprime,action,node,o]=update[nodeprime,action,node,o]*policy[action,node]



Value: [[1.48 0.88]
 [0.88 1.48]]
Objective function: 1.18


In the following section the same calculations are carried out. In this case rank two arrays have been used to evalute the Bellman equation. 


The "Bellman matrix" (Bmat) is block diagonal, which makes sense because the transition probability from one stae to the other is zero.

In [20]:
import numpy as np
from f2py_jit import jit
f90=jit("SA_QCLP_ut_02_cor1.f90",flags="-O3 -fcheck=bounds")

#initial value for the x(q'aqo)
x_init=x
#initial value for the y(qs)
y_init=v
#Initial value of objective function
val_init=0.5*np.sum(y_init[init_node,:])

print("Initial mean value   >>>>>>>>>>")
print(val_init)
print("Initial policy       >>>>>>>>>>")
print(policy)

#Initial parameters
T0=10
T_end=1e-6
Temp_factor=0.3
factor=0.4
nstep=50
pen_step=350
N=20
stdev=1.0

val_min=-val_init

size_gauss_x=round(dimO*nstep*(1.0-factor))
size_cauchy_x=round(dimO*nstep*(factor))

size_gauss_y=round(dimS*nstep*(1.0-factor))
size_cauchy_y=round(dimS*nstep*(factor))

rndx=np.zeros((dimQ,dimA,dimQ,pen_step*dimO*nstep*round(1+np.log(T_end/T0)/np.log(Temp_factor))))
rndy=np.zeros((dimQ,pen_step*dimS*nstep*round(1+np.log(T_end/T0)/np.log(Temp_factor))))


for p in range(N):

    sigma=2.0
    mu=2.0

    for j in range(pen_step*round(1+np.log(T_end/T0)/np.log(Temp_factor))):

        #x(q'aqo) sample size and generation

        #Generarion of gauss sample. In this way the point generated is independent from o
        x_gauss=np.random.normal(0.0,stdev,size=(dimQ,dimA,dimQ,size_gauss_x))
        x_cauchy=np.random.standard_cauchy(size=(dimQ,dimA,dimQ,size_cauchy_x))
        a=np.append(x_cauchy,x_gauss,axis=3)
        rndx[:,:,:,j*dimO*nstep:(j+1)*dimO*nstep]=a
        #Generating the effective gauss_sample                 

        #y(qs) sample size and generation

        y_cauchy=np.random.standard_cauchy(size=(dimQ,size_cauchy_y))
        y_gauss=np.random.normal(0.0,stdev,size=(dimQ,size_gauss_y))
        b=np.append(y_cauchy,y_gauss,axis=1)
        rndy[:,j*dimO*nstep:(j+1)*dimO*nstep]=b
    
    #Simulated Annealing 
    Res=f90.qclp.drive(T0,T_end,Temp_factor,nstep,pen_step,x_init,y_init,val_init,rew,env_prob,obs_prob,sigma,mu,rndx,rndy,gamma)
    
    if (Res[2] <= val_min):
        x_init=Res[0]
        x_min=Res[0]
        y_init=Res[1]
        y_min=Res[1]
        val_init=Res[2]
        val_min=Res[2]
        
    #print(p,"Gone. Objective function:", Res[2])

print("value                >>>>>>>>>>>")
print(y_min)
print("mean value           >>>>>>>>>>>")
print(0.5*np.sum(y_min[init_node,:]))
print("objective function   >>>>>>>>>>>")
print(val_min)


Initial mean value   >>>>>>>>>>
1.18
Initial policy       >>>>>>>>>>
[[1. 0.]
 [0. 1.]]


value                >>>>>>>>>>>
[[1.48 0.88]
 [0.88 1.48]]
mean value           >>>>>>>>>>>
1.18
objective function   >>>>>>>>>>>
-1.18


In [21]:
print(y_min)
Pol0=np.zeros((dimA,dimQ))
Pol1=np.zeros((dimA,dimQ))

for action in actions:
    for node in nodes:
        for nodeprime in nodes:
            Pol0[action,node]=np.sum(x_min[:,action,node,0])

for action in actions:
    for node in nodes:
        Pol1[action,node]=np.sum(x_min[:,action,node,1])


print(Pol0)
print(Pol1)

normalization=np.zeros((dimQ,dimO))

for node in nodes:
    for o in obs:
        for nodeprime in nodes:
            for action in actions:
                normalization[node,o]=np.sum(x_min[:,:,node,o])

print("norm",normalization)

[[1.48 0.88]
 [0.88 1.48]]
[[1. 0.]
 [0. 1.]]
[[1. 0.]
 [0. 1.]]
norm [[1. 1.]
 [1. 1.]]


In [22]:
for node in nodes:
    for o in obs:
        for nodeprime in nodes:
            for action in actions:
                if (x_min[nodeprime,action,node,o]/Pol1[action,node] > 1e-4): 
                    print("INIT NODE:",node,">>>","ACTION:",action,">>>","OBS:",o,">>>","FINAL NODE:",nodeprime,"---->",x_min[nodeprime,action,node,o]/Pol0[action,node])
                    print(".")

INIT NODE: 0 >>> ACTION: 0 >>> OBS: 0 >>> FINAL NODE: 1 ----> 1.0
.
INIT NODE: 0 >>> ACTION: 0 >>> OBS: 1 >>> FINAL NODE: 0 ----> 1.0
.
INIT NODE: 1 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 0 ----> 1.0
.
INIT NODE: 1 >>> ACTION: 1 >>> OBS: 1 >>> FINAL NODE: 1 ----> 1.0
.


  if (x_min[nodeprime,action,node,o]/Pol1[action,node] > 1e-4):


Controller size=3

In [23]:
import numpy as np

dimS=2
dimA=2
dimQ=3
dimO=2
p=0.8


states=np.arange(0,dimS,dtype=int)
actions=np.arange(0,dimA,dtype=int)
nodes=np.arange(0,dimQ,dtype=int)
obs=np.arange(0,dimO,dtype=int)


# p(s'|sa)
env_prob=np.zeros((dimS,dimA,dimS))

for stateprime in states:
    for action in actions:
        for state in states:
            if (stateprime == state): env_prob[stateprime,state,action]=1


# p(a|q) Here's the policty of the WSLS strategy
policy=np.zeros((dimA,dimQ))

for action in actions:
    for node in nodes:
        if (action == node):
            policy[action,node]=1
        elif (node==2):
            policy[action,node]=0.5
        else:
            policy[action,node]=0

#f(o|s'a)
obs_prob=np.zeros((dimO,dimS,dimA))

for stateprime in states:
    for o in obs:
        for action in actions:
            if (stateprime==action): 
                obs_prob[o,stateprime,action]=p**o*(1.0-p)**(1-o)
            else:
                obs_prob[o,stateprime,action]=(1.0-p)**o*p**(1.0-o)

#g(q'|qao) Here's the memory update of the WSLS strategy
update=np.zeros((dimQ,dimA,dimQ,dimO))

for nodeprime in nodes:
    for action in actions:
        for node in nodes:
            for o in obs:
                # if we win in the node corresponding to action, we stay
                if nodeprime == node and node == action and o == 1:
                    update[nodeprime,action,node,o] = 1
                # if we lose in the node corresponding to action, we shift to the intermediate node
                elif nodeprime == 2 and node == action and o == 0:
                    update[nodeprime,action,node,o] = 1
                # if we win in the intermediate node, we shift to node corresponding to action taken
                elif nodeprime == action and node == 2 and o == 1:
                    update[nodeprime,action,node,o] = 1 # P(a | 2) = 0.5
                # if we lose in the intermediate node, we shift to node corresponding to the other action
                elif nodeprime!= action and node == 2 and nodeprime != 2 and o == 0:
                    update[nodeprime,action,node,o] = 1 # P(a | 2) = 0.5
                # the rest is 0
                else:
                    update[nodeprime,action,node,o] = 0
                
#r(sa) Average reward
rew=np.zeros((dimS,dimA))

rew[0,0]=p
rew[0,1]=1-p
rew[1,0]=1-p
rew[1,1]=p


In [24]:
#b(sq)
init_node=0
gamma=0.5
b=np.zeros((dimQ,dimS))

for node in nodes:
    for state in states:
        for action in actions:
            b[node,state]+=rew[state,action]*policy[action,node]


#Bellman equation linear constraint
T=np.zeros((dimQ,dimS,dimQ,dimS))
Id=np.zeros((dimQ,dimS,dimQ,dimS))

#T(qs|q's')=sum_a sum_y (p(s'|sa)*f(o|sa)*g(q'|qao)*policy(a|q))
for node in nodes:
    for state in states:
        for nodeprime in nodes:
            for stateprime in states:
                for o in obs:
                    for action in actions:
                        T[node,state,nodeprime,stateprime]+=update[nodeprime,action,node,o]*policy[action,node]*env_prob[stateprime,state,action]*obs_prob[o,stateprime,action]

#Id=delta(ss',qq')
for nodeprime in nodes:
    for stateprime in states:
        for node in nodes:
            for state in states:
                if (state==stateprime and node==nodeprime): Id[node,state,nodeprime,stateprime]=1

#delta(ss',qq')- gamma*[sum_a sum_y (p(s'|sa)*f(o|sa)*g(q'|qao)*policy(a|q))]
Btens=Id-gamma*T

#Solve the linear system to get the single value solution
v=np.linalg.tensorsolve(Btens,b)
print("Value:",v)
print("=============")

#Calculation of the objective function with fixed inotial node
print("Objective function:", 0.5*np.sum(v[init_node,:]))

#Def normalized x(q',a,q,o)
x=np.zeros((dimQ,dimA,dimQ,dimO))
for nodeprime in nodes:
    for action in actions:
        for node in nodes:
            for o in obs:
                x[nodeprime,action,node,o]=update[nodeprime,action,node,o]*policy[action,node]

Value: [[1.53125 0.75   ]
 [0.75    1.53125]
 [1.1875  1.1875 ]]
Objective function: 1.140625


In [25]:
import numpy as np
from f2py_jit import jit
f90=jit("SA_QCLP_ut_02_cor1.f90",flags="-O3 -fcheck=bounds")

#initial value for the x(q'aqo)
x_init=x
#initial value for the y(qs)
y_init=v
#Initial value of objective function
val_init=0.5*np.sum(y_init[init_node,:])

print("Initial mean value   >>>>>>>>>>")
print(val_init)
print("Initial policy       >>>>>>>>>>")
print(policy)

#Initial parameters
T0=10
T_end=1e-6
Temp_factor=0.3
factor=0.4
nstep=50
pen_step=350
N=20
stdev=1.0

val_min=-val_init

size_gauss_x=round(dimO*nstep*(1.0-factor))
size_cauchy_x=round(dimO*nstep*(factor))

size_gauss_y=round(dimS*nstep*(1.0-factor))
size_cauchy_y=round(dimS*nstep*(factor))

rndx=np.zeros((dimQ,dimA,dimQ,pen_step*dimO*nstep*round(1+np.log(T_end/T0)/np.log(Temp_factor))))
rndy=np.zeros((dimQ,pen_step*dimS*nstep*round(1+np.log(T_end/T0)/np.log(Temp_factor))))


for p in range(N):

    sigma=2.0
    mu=2.0

    for j in range(pen_step*round(1+np.log(T_end/T0)/np.log(Temp_factor))):

        #x(q'aqo) sample size and generation

        #Generarion of gauss sample. In this way the point generated is independent from o
        x_gauss=np.random.normal(0.0,stdev,size=(dimQ,dimA,dimQ,size_gauss_x))
        x_cauchy=np.random.standard_cauchy(size=(dimQ,dimA,dimQ,size_cauchy_x))
        a=np.append(x_cauchy,x_gauss,axis=3)
        rndx[:,:,:,j*dimO*nstep:(j+1)*dimO*nstep]=a
        #Generating the effective gauss_sample                 

        #y(qs) sample size and generation

        y_cauchy=np.random.standard_cauchy(size=(dimQ,size_cauchy_y))
        y_gauss=np.random.normal(0.0,stdev,size=(dimQ,size_gauss_y))
        b=np.append(y_cauchy,y_gauss,axis=1)
        rndy[:,j*dimO*nstep:(j+1)*dimO*nstep]=b
    
    #Simulated Annealing 
    Res=f90.qclp.drive(T0,T_end,Temp_factor,nstep,pen_step,x_init,y_init,val_init,rew,env_prob,obs_prob,sigma,mu,rndx,rndy,gamma)
    
    if (Res[2] < val_min):
        x_init=Res[0]
        x_min=Res[0]
        y_init=Res[1]
        y_min=Res[1]
        val_init=Res[2]
        val_min=Res[2]

    #print(p,"Gone. Objective function:", Res[2])

print("value                >>>>>>>>>>>")
print(y_min)
print("mean value           >>>>>>>>>>>")
print(0.5*np.sum(y_min[init_node,:]))
print("objective function   >>>>>>>>>>>")
print(val_min)


Initial mean value   >>>>>>>>>>
1.140625
Initial policy       >>>>>>>>>>
[[1.  0.  0.5]
 [0.  1.  0.5]]


value                >>>>>>>>>>>
[[1.47140654 0.87772462]
 [0.82853078 1.4750692 ]
 [1.13858568 1.07768525]]
mean value           >>>>>>>>>>>
1.1745655771555221
objective function   >>>>>>>>>>>
-1.1743030606318074


In [26]:
print(y_min)
Pol0=np.zeros((dimA,dimQ))
Pol1=np.zeros((dimA,dimQ))

for action in actions:
    for node in nodes:
        for nodeprime in nodes:
            Pol0[action,node]=np.sum(x_min[:,action,node,0])

for action in actions:
    for node in nodes:
        Pol1[action,node]=np.sum(x_min[:,action,node,1])


print(Pol0)
print(Pol1)

normalization=np.zeros((dimQ,dimO))

for node in nodes:
    for o in obs:
        for nodeprime in nodes:
            for action in actions:
                normalization[node,o]+=x_min[nodeprime,action,node,o]

print("norm",normalization)



[[1.47140654 0.87772462]
 [0.82853078 1.4750692 ]
 [1.13858568 1.07768525]]
[[9.99934733e-01 1.70587378e-02 4.40111013e-01]
 [6.52668852e-05 9.82941262e-01 5.59888987e-01]]
[[9.99935236e-01 1.70586996e-02 4.40110290e-01]
 [6.47637846e-05 9.82941300e-01 5.59889710e-01]]
norm [[1. 1.]
 [1. 1.]
 [1. 1.]]


In [27]:
for node in nodes:
    for o in obs:
        for nodeprime in nodes:
            for action in actions:
                if (x_min[nodeprime,action,node,o]/Pol1[action,node] > 1e-4): 
                    print("INIT NODE:",node,">>>","ACTION:",action,">>>","OBS:",o,">>>","FINAL NODE:",nodeprime,"---->",x_min[nodeprime,action,node,o]/Pol0[action,node])
                    print(".")

INIT NODE: 0 >>> ACTION: 0 >>> OBS: 0 >>> FINAL NODE: 0 ----> 0.0004300658655291125
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 0 ----> 0.9453351924189224
.
INIT NODE: 0 >>> ACTION: 0 >>> OBS: 0 >>> FINAL NODE: 1 ----> 0.999558134861155
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 1 ----> 0.05364852122386734
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 2 ----> 0.001016286357210175
.
INIT NODE: 0 >>> ACTION: 0 >>> OBS: 1 >>> FINAL NODE: 0 ----> 0.9999924583603992
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 1 >>> FINAL NODE: 0 ----> 0.6182985373759196
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 1 >>> FINAL NODE: 1 ----> 0.33041307453736146
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 1 >>> FINAL NODE: 2 ----> 0.043580030167369235
.
INIT NODE: 1 >>> ACTION: 0 >>> OBS: 0 >>> FINAL NODE: 0 ----> 0.004107510345182457
.
INIT NODE: 1 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 0 ----> 0.6591882898052094
.
INIT NODE: 1 >>> ACTION: 0 >>> OBS: 0 >>> FINAL NODE: 1 ----> 0.9732423259541

In [28]:
No=np.zeros((dimA,dimQ,dimO))

for nodeprime in nodes:
    for action in actions:
        for node in nodes:
            for o in obs:
                No[action,node,o]+=x_min[nodeprime,action,node,o]/Pol0[action,node]


for action in actions:
    for node in nodes:
        for o in obs:
            print(action,node,o,No[action,node,o])

0 0 0 0.9999999999999999
0 0 1 1.000000503133349
0 1 0 0.9999999999999999
0 1 1 0.9999977613282003
0 2 0 1.0
0 2 1 0.999998357193581
1 0 0 0.9999999999999999
1 0 1 0.9922916420806502
1 1 0 1.0
1 1 1 1.0000000388516757
1 2 0 1.0
1 2 1 1.0000012913581333


Controller size=4

In [29]:
dimQ=4
policy=np.zeros((dimA,dimQ))
x=np.zeros((dimQ,dimA,dimQ,dimO))

states=np.arange(0,dimS,dtype=int)
actions=np.arange(0,dimA,dtype=int)
nodes=np.arange(0,dimQ,dtype=int)
obs=np.arange(0,dimO,dtype=int)

for node in nodes:
        for action in actions:
            for nodeprime in nodes:
                for o in obs:
                    # from node 0 having picked first action and won, we stay
                    if node ==  0 and nodeprime == 0 and action == 0 and o == 1:
                        x[nodeprime,action,node,o] = 1
                    # from node 0 having picked first action and lost, we shift to node 1
                    elif node== 0 and nodeprime == 1 and action == 0 and o == 0:
                        x[nodeprime,action,node,o] = 1
                    # from node 1 having picked first action and won, stochastic transition
                    elif node == 1 and nodeprime == 0 and action == 0 and o == 1:
                        x[nodeprime,action,node,o] = 0.5
                    elif node == 1 and nodeprime == 2 and action == 0 and o == 1:
                        x[nodeprime,action,node,o] = 0.5
                    # from node 1 having picked first action and lost again, we shift to node 2
                    elif node == 1 and nodeprime == 2 and action == 0 and o == 0:
                        x[nodeprime,action,node,o] = 1
                    # from node 2 having picked the second action and won, we stay
                    elif node== 2 and nodeprime == 2 and action == 1 and o == 1:
                        x[nodeprime,action,node,o] = 1
                     # from node 2 having picked the second action and lost, we shift to node 3
                    elif node== 2 and nodeprime == 3 and action == 1 and o == 0:
                        x[nodeprime,action,node,o] = 1
                    # from node 3 having picked the second action and won, stochastic transition
                    elif node == 3 and nodeprime == 2 and action == 1 and o == 1:
                        x[nodeprime,action,node,o] = 0.5
                    elif node == 3 and nodeprime == 0 and action == 1 and o == 1:
                        x[nodeprime,action,node,o] = 0.5
                    # from node 3 having picked the second action and lost again, we shift to node 0
                    elif node == 3 and nodeprime == 0 and action == 1 and o == 0:
                        x[nodeprime,action,node,o] = 1
                    # else = 0
                    else:
                        x[nodeprime,action,node,o] = 0

for action in actions:
    for node in nodes:
        for nodeprime in nodes:
            policy[action,node]=np.sum(x[:,action,node,0])

#b(sq)
init_node=0
gamma=0.5
b=np.zeros((dimQ,dimS))

for node in nodes:
    for state in states:
        for action in actions:
            b[node,state]+=rew[state,action]*policy[action,node]


#Bellman equation linear constraint
T=np.zeros((dimQ,dimS,dimQ,dimS))
Id=np.zeros((dimQ,dimS,dimQ,dimS))

#T(qs|q's')=sum_a sum_y (p(s'|sa)*f(o|sa)*g(q'|qao)*policy(a|q))
for node in nodes:
    for state in states:
        for nodeprime in nodes:
            for stateprime in states:
                for o in obs:
                    for action in actions:
                        T[node,state,nodeprime,stateprime]+=x[nodeprime,action,node,o]*env_prob[stateprime,state,action]*obs_prob[o,stateprime,action]

#Id=delta(ss',qq')
for nodeprime in nodes:
    for stateprime in states:
        for node in nodes:
            for state in states:
                if (state==stateprime and node==nodeprime): Id[node,state,nodeprime,stateprime]=1

#delta(ss',qq')- gamma*[sum_a sum_y (p(s'|sa)*f(o|sa)*g(q'|qao)*policy(a|q))]
Btens=Id-gamma*T

#Solve the linear system to get the single value solution
v=np.linalg.tensorsolve(Btens,b)
print("Value:",v)
print("=============")

#Calculation of the objective function with fixed inotial node
print("Objective function:", 0.5*np.sum(v[init_node,:]))


Value: [[1.55009901 0.63524752]
 [1.30059406 0.92930693]
 [0.63524752 1.55009901]
 [0.92930693 1.30059406]]
Objective function: 1.0926732673267328


In [36]:
import numpy as np
from f2py_jit import jit
f90=jit("SA_QCLP_ut_02_cor1.f90",flags="-O3 -fcheck=bounds")

#initial value for the x(q'aqo)
x_init=x
#initial value for the y(qs)
y_init=v
#Initial value of objective function
val_init=0.5*np.sum(y_init[init_node,:])

print("Initial mean value   >>>>>>>>>>")
print(val_init)
print("Initial policy       >>>>>>>>>>")
print(policy)

#Initial parameters
T0=10
T_end=1e-6
Temp_factor=0.5
factor=0.4
nstep=100
pen_step=350
N=20
stdev=1.0

val_min=-val_init

size_gauss_x=round(dimO*nstep*(1.0-factor))
size_cauchy_x=round(dimO*nstep*(factor))

size_gauss_y=round(dimS*nstep*(1.0-factor))
size_cauchy_y=round(dimS*nstep*(factor))

rndx=np.zeros((dimQ,dimA,dimQ,pen_step*dimO*nstep*round(1+np.log(T_end/T0)/np.log(Temp_factor))))
rndy=np.zeros((dimQ,pen_step*dimS*nstep*round(1+np.log(T_end/T0)/np.log(Temp_factor))))


for p in range(N):

    sigma=2.0
    mu=2.0

    for j in range(pen_step*round(1+np.log(T_end/T0)/np.log(Temp_factor))):

        #x(q'aqo) sample size and generation

        #Generarion of gauss sample. In this way the point generated is independent from o
        x_gauss=np.random.normal(0.0,stdev,size=(dimQ,dimA,dimQ,size_gauss_x))
        x_cauchy=np.random.standard_cauchy(size=(dimQ,dimA,dimQ,size_cauchy_x))
        a=np.append(x_cauchy,x_gauss,axis=3)
        rndx[:,:,:,j*dimO*nstep:(j+1)*dimO*nstep]=a
        #Generating the effective gauss_sample                 

        #y(qs) sample size and generation

        y_cauchy=np.random.standard_cauchy(size=(dimQ,size_cauchy_y))
        y_gauss=np.random.normal(0.0,stdev,size=(dimQ,size_gauss_y))
        b=np.append(y_cauchy,y_gauss,axis=1)
        rndy[:,j*dimO*nstep:(j+1)*dimO*nstep]=b
    
    #Simulated Annealing 
    Res=f90.qclp.drive(T0,T_end,Temp_factor,nstep,pen_step,x_init,y_init,val_init,rew,env_prob,obs_prob,sigma,mu,rndx,rndy,gamma)
    
    if (Res[2] < val_min):
        x_init=Res[0]
        x_min=Res[0]
        y_init=Res[1]
        y_min=Res[1]
        val_init=Res[2]
        val_min=Res[2]

    #print(p,"Gone. Objective function:", Res[2])

print("value                >>>>>>>>>>>")
print(y_min)
print("mean value           >>>>>>>>>>>")
print(0.5*np.sum(y_min[init_node,:]))
print("objective function   >>>>>>>>>>>")
print(val_min)


Initial mean value   >>>>>>>>>>
1.0926732673267328
Initial policy       >>>>>>>>>>
[[1. 1. 0. 0.]
 [0. 0. 1. 1.]]
value                >>>>>>>>>>>
[[1.47971855 0.88013575]
 [0.9814883  1.15856767]
 [0.87836575 1.480302  ]
 [1.04502092 1.08841582]]
mean value           >>>>>>>>>>>
1.1799271486641665
objective function   >>>>>>>>>>>
-1.1794209678688907


In [40]:
print(y_min)
Pol0=np.zeros((dimA,dimQ))
Pol1=np.zeros((dimA,dimQ))

for action in actions:
    for node in nodes:
        for nodeprime in nodes:
            Pol0[action,node]=np.sum(x_min[:,action,node,0])

for action in actions:
    for node in nodes:
        Pol1[action,node]=np.sum(x_min[:,action,node,1])


print(Pol0)
print(Pol1)

normalization=np.zeros((dimQ,dimO))

for node in nodes:
    for o in obs:
        for nodeprime in nodes:
            for action in actions:
                normalization[node,o]+=x_min[nodeprime,action,node,o]

print("norm",normalization)



[[1.47971855 0.88013575]
 [0.9814883  1.15856767]
 [0.87836575 1.480302  ]
 [1.04502092 1.08841582]]
[[9.99994962e-01 3.75698284e-01 1.40010383e-05 5.20459050e-01]
 [5.03831290e-06 6.24301716e-01 9.99985999e-01 4.79540950e-01]]
[[9.99994286e-01 3.75698424e-01 1.41393454e-05 5.20458808e-01]
 [5.71352928e-06 6.24301576e-01 9.99985861e-01 4.79541192e-01]]
norm [[1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]]


In [41]:
for node in nodes:
    for o in obs:
        for nodeprime in nodes:
            for action in actions:
                if (x_min[nodeprime,action,node,o]/Pol1[action,node] > 1e-4): 
                    print("INIT NODE:",node,">>>","ACTION:",action,">>>","OBS:",o,">>>","FINAL NODE:",nodeprime,"---->",x_min[nodeprime,action,node,o]/Pol0[action,node])
                    print(".")

INIT NODE: 0 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 0 ----> 0.14912330435400264
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 1 ----> 0.2009944370437969
.
INIT NODE: 0 >>> ACTION: 0 >>> OBS: 0 >>> FINAL NODE: 2 ----> 0.999988544671814
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 2 ----> 0.3226835541340621
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 3 ----> 0.3271987044681385
.
INIT NODE: 0 >>> ACTION: 0 >>> OBS: 1 >>> FINAL NODE: 0 ----> 0.9999817264832426
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 1 >>> FINAL NODE: 0 ----> 0.8924204688592045
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 1 >>> FINAL NODE: 1 ----> 0.05845135252415918
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 1 >>> FINAL NODE: 2 ----> 0.041157095031159614
.
INIT NODE: 0 >>> ACTION: 1 >>> OBS: 1 >>> FINAL NODE: 3 ----> 0.14198745001188817
.
INIT NODE: 1 >>> ACTION: 0 >>> OBS: 0 >>> FINAL NODE: 0 ----> 0.22163819738380275
.
INIT NODE: 1 >>> ACTION: 1 >>> OBS: 0 >>> FINAL NODE: 0 ----> 0.4173980282733184
.

In [43]:
No=np.zeros((dimA,dimQ,dimO))

for nodeprime in nodes:
    for action in actions:
        for node in nodes:
            for o in obs:
                No[action,node,o]+=x_min[nodeprime,action,node,o]/Pol0[action,node]


for action in actions:
    for node in nodes:
        for o in obs:
            print(action,node,o,No[action,node,o])

0 0 0 1.0
0 0 1 0.9999993247802107
0 1 0 0.9999999999999998
0 1 1 1.0000003720421562
0 2 0 1.0
0 2 1 1.0098783435062006
0 3 0 1.0
0 3 1 0.99999953568748
1 0 0 1.0
1 0 1 1.1340163664264113
1 1 0 1.0000000000000002
1 1 1 0.9999997761088971
1 2 0 1.0
1 2 1 0.9999998616909975
1 3 0 1.0
1 3 1 1.0000005039312143
