In [1]:
from plt_utils import *
from test_levels import *
import gym

print(gym.__version__)
env = time_level()

0.26.2


# Given env.P

## policy computation
- policy_update corresponds to $f_v$ <br>
- value_update(iterations =1) corresponds to $L_{\text{policy}}$ <br>

- value_iteration is applying $L_{f_v}v = Uv$ (theorem 1.3.5)  <br>
- backward_induction is value_iteration with $v^{T+1} =0$ and time dependence (theorem 1.2.1) <br>
- policy_iteration is generalized_iteration because of (theorem 1.3.6) <br>
we don't apply $L_{\pi}$ infinite times but to convergence or $1000$ 
iterations.

the code is obvious, there is a small difference in the formula <br> 
because rewards are defined differently so you take it in the sum <br>
that way you get averaged reward corresponding to our definition <br>
and you can decode $b$ with: <br>
env.P[state][action] = [(probability, nextstate, reward, done), ...] <br>
(To make backward induction time dependent just make env.P[state][action] 
time dependent.)


In [24]:
def states(env): return range(env.observation_space.n)
def actions(env, state): return env.P[state].keys()
# infmetric, norm, only works for v
def oometric(v1, v2): return max(abs(v1[i]-v2[i]) for i in v1.keys())

def value_update(env, alpha, v, policy, iterations=1, eps=10**(-4)):
    for _ in range(iterations): 
        vold = v.copy() 
        for state in states(env): 
            v[state] = sum(b[0]*(b[2]+alpha*v[b[1]]) 
                           if not(b[3]) else b[0]*b[2] for b in env.P[state][policy[state]])
        if oometric(v, vold) < eps: break #convergence



def policy_update(env, alpha, v, policy):
    for state in states(env):
        max_a, max_val = 0, -float("inf") #to select the argmax
        for action in actions(env, state):
            val = sum(b[0]*(b[2]+alpha*v[b[1]]) for b in env.P[state][action])
            max_a, max_val = (action, val) if val > max_val else (max_a, max_val)
        policy[state] = max_a

def value_iteration(env,alpha,max_iter = 30,eps = 10**(-6)):
    v = {state:0 for state in states(env)} 
    policy = {state:0 for state in states(env)} 
    vv,pp = [],[] #these are for plotting

    for i in range(max_iter):
        policy_update(env,alpha,v,policy)
        value_update(env,alpha,v,policy,iterations=1)
        vv.append(v.copy()) # history of value functions
        pp.append(policy.copy()) # policy history
        if i>2 and oometric(vv[-1], vv[-2]) < eps: break
    return vv,pp

def backward_induction(env,alpha,T):
    return value_iteration(env,alpha,T,0)

def generalized_iteration(env,alpha,inner_iter=1,max_iter = 30,eps = 10**(-6)):
    v = {state:0 for state in states(env)} 
    policy = {state:0 for state in states(env)} 
    vv,pp = [],[] #these are for plotting

    for i in range(max_iter):
        value_update(env,alpha,v,policy,inner_iter,eps)
        policy_update(env,alpha,v,policy)
        vv.append(v.copy()) # history of value functions
        pp.append(policy.copy()) # policy history
        if i>2 and pp[-1]==pp[-2]==pp[-3]: break
    return vv,pp

def policy_iteration(env,alpha,max_iter = 30,eps = 10**(-6)):
    return generalized_iteration(env=env,alpha=alpha,inner_iter=10**3,max_iter = max_iter,eps = eps)

In [87]:
# vv,pp = backward_induction(env,alpha=1,T=30) 
# vv,pp = backward_induction(env,alpha=1,T=1000) #programming task 1
vv,pp = policy_iteration(env,alpha=0.999,max_iter=50,eps=10**(-6)) #programming task 2
# vv,pp = value_iteration(env,alpha=0.999,max_iter=300,eps = 0.001) #programming task 3
sol = list(pp[-1].items()) # asked form of the policy
print(sol)
print(f"val(0) = {vv[-1][0]} at time 1 for finite horizon")
intvp(vv,pp)

[(0, 0), (1, 3), (2, 3), (3, 3), (4, 3), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 1), (16, 0), (17, 0), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 0), (25, 0), (26, 2), (27, 2), (28, 3), (29, 2), (30, 1), (31, 1), (32, 0), (33, 0), (34, 2), (35, 0), (36, 0), (37, 2), (38, 1), (39, 1), (40, 0), (41, 0), (42, 2), (43, 0), (44, 0), (45, 2), (46, 1), (47, 1), (48, 0), (49, 0), (50, 2), (51, 0), (52, 0), (53, 2), (54, 1), (55, 1), (56, 1), (57, 1), (58, 2), (59, 0), (60, 0), (61, 2), (62, 2), (63, 0)]
val(0) = 0.5165094882368573 at time 1 for finite horizon


interactive(children=(IntSlider(value=0, description='iterations', max=9), IntSlider(value=0, description='row…

## policy evaluation
value_eval is for evaluating a policy, basically it is a big  <br>
average of the rewards times the probability of getting it <br>
lot of terms can be reused, value_update does it basically 

In [104]:
def finite_value_eval(env,alpha,pp):
    v = {state:0 for state in states(env)} 
    vv= []
    for p in pp:
        value_update(env,alpha,v,p,1,0)
        vv.append(v.copy())
    print(f"val(0) = {vv[-1][0]} at time 1 for finite horizon")
    return vv

def infinite_value_eval(env,alpha,policy,eps=10**(-6)):
    v = {state:0 for state in states(env)} 
    value_update(env,alpha,v,policy,10**5,eps)
    print(f"val(0) = {v[0]} for infinite horizon")
    return v 

def random_policy(env,T):
    return [{state:env.action_space.sample() 
            for state in states(env)}
            for _ in range(T)]


In [106]:
pvv,ppp = policy_iteration(env,alpha=0.999,max_iter=50,eps=10**(-6)) 
pv = infinite_value_eval(env,0.999,ppp[-1])
print(pvv[-1][0])

val(0) = 0.5165083584549253 for infinite horizon
0.5165094882368573


In [167]:
bvv,bpp = backward_induction(env,alpha=1,T=30) 
bbvv = finite_value_eval(env,1,bpp)
print(bvv[-1][0])

val(0) = 0.02869336917456404 at time 1 for finite horizon
0.02869336917456404


In [164]:
rpp = random_policy(env,2000)
rvv = finite_value_eval(env,1,rpp)

val(0) = 9.967856640248395e-05 at time 1 for finite horizon


# Simulation based
evaluation is easy MC estimation

In [9]:
print(pp[-1])

{0: 0, 1: 3, 2: 3, 3: 3, 4: 3, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 1, 16: 0, 17: 0, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 0, 25: 0, 26: 2, 27: 2, 28: 3, 29: 2, 30: 1, 31: 1, 32: 0, 33: 0, 34: 2, 35: 0, 36: 0, 37: 2, 38: 1, 39: 1, 40: 0, 41: 0, 42: 2, 43: 0, 44: 0, 45: 2, 46: 1, 47: 1, 48: 0, 49: 0, 50: 2, 51: 0, 52: 0, 53: 2, 54: 1, 55: 1, 56: 1, 57: 1, 58: 2, 59: 0, 60: 0, 61: 2, 62: 2, 63: 0}


In [161]:
def MC_eval(env,pol,alpha,T=10**3,nsim = 10**3):
    # does MC estimation of the expected reward
    running_sum = 0
    for _ in range(nsim):
        stat = env.reset()
        # state = env.reset() #google colab
        total_reward = 0
        state = stat[0]
        for t in range(T): 
            action = pol[t][state]
            state, reward, done, _, _ = env.step(action)
            total_reward +=alpha**t*reward
            if done: break
        env.close()
        running_sum +=total_reward/nsim
    return running_sum

def infinite_pol_rep(p,T):
    return {t: {i:p[i] 
              for i in range(env.observation_space.n)}
                for t in range(T)}

def finite_pol_rep(pp,T):
    return {t: {i:pp[-t-1][i] 
              for i in range(env.observation_space.n)}
                for t in range(T)}


In [159]:
pvv,ppp = policy_iteration(env,alpha=0.999,max_iter=50,eps=10**(-6)) 
T = 10**3 # basically infinity
rep_ppp = infinite_pol_rep(ppp[-1],T)
pMC = MC_eval(env,rep_ppp,0.999,T,10**3)
print(f"MC eval = {pMC}")
print(f"refrence = {pvv[-1][0]}")

MC eval = 0.5135162226085791
refrence = 0.5165094882368573


In [162]:
bvv,bpp = backward_induction(env,alpha=1,T=30) 
T = len(bpp)
rep_bpp = finite_pol_rep(bpp,T)
bMC = MC_eval(env,rep_bpp,1,T,10**3)
print(f"MC eval = {bMC}")
print(f"refrence = {bvv[-1][0]}")

MC eval = 0.012000000000000004
refrence = 0.02869336917456404
