In [3]:
from plt_utils import *
from test_levels import *
import gym
# Show the installed gym version (the recorded output of this cell is 0.26.2).
print(gym.__version__)

0.26.2


## policy computation
- policy_update corresponds to $f_v$ <br>
- value_update(iterations =1) corresponds to $L_{\text{policy}}$ <br>
- value_update_seidel(iterations =1) uses the updated values of v 
as soon as they are available, like the Gauss-Seidel method. <br>

- value_iteration is applying $L_{f_v}v = Uv$ (theorem 1.3.5)  <br>
- backward_induction is value_iteration with $v^{T+1} =0$ and time dependence (theorem 1.2.1) <br>
- policy_iteration is implemented via generalized_iteration, which is justified by theorem 1.3.6: <br>
we cannot apply $L_{\pi}$ infinitely many times, so we apply it until convergence or for at most $1000$ 
iterations.

The code is straightforward. There is a small difference from the formula, <br> 
because rewards are defined differently here: the reward is attached to each transition, so it is taken inside the sum. <br>
That way you get the averaged (expected) reward corresponding to our definition, <br>
and you can decode $b$ with: <br>
env.P[state][action] = [(probability, nextstate, reward, done), ...] <br>
(To make backward induction time dependent, just make env.P[state][action] 
time dependent.)


In [4]:
def states(env):
    """Return the state indices ``0 .. n-1`` of a discrete environment.

    ``n`` is read from ``env.observation_space.n``.
    """
    n_states = env.observation_space.n
    return range(n_states)
def actions(env, state):
    """Return the actions available in ``state``, i.e. the keys of ``env.P[state]``."""
    transitions_by_action = env.P[state]
    return transitions_by_action.keys()
# Sup-norm (infinity-norm) distance; only meaningful for value functions v,
# not for policies.
def oometric(v1, v2):
    """Return ``max_s |v1[s] - v2[s]|`` over the keys of ``v1``.

    Parameters
    ----------
    v1, v2 : dict
        Value functions mapping state -> number. ``v2`` must contain every
        key of ``v1`` (a missing key raises ``KeyError``).

    Returns
    -------
    The largest absolute difference, or ``0`` when ``v1`` is empty (two empty
    value functions are at distance zero, instead of ``max()`` raising
    ``ValueError`` on an empty sequence).
    """
    return max((abs(v1[s] - v2[s]) for s in v1), default=0)

def value_update(env, alpha, v, policy, iterations=1, eps=10**(-4)):
    """Apply the policy-evaluation backup to ``v`` in place (Jacobi style).

    Each sweep snapshots ``v`` first and computes every new state value from
    that snapshot only. Transitions are read from
    ``env.P[state][action] = [(probability, nextstate, reward, done), ...]``;
    a transition flagged ``done`` contributes its reward without any
    discounted continuation value.

    Parameters
    ----------
    env : environment exposing ``P`` and ``observation_space.n``.
    alpha : discount factor.
    v : dict state -> value, updated in place.
    policy : dict state -> action to evaluate.
    iterations : maximum number of sweeps.
    eps : stop early once a sweep changes ``v`` by less than ``eps`` in sup-norm.

    Returns
    -------
    None
    """
    for _ in range(iterations):
        snapshot = v.copy()
        for state in states(env):
            backup = 0
            for b in env.P[state][policy[state]]:
                prob, nxt, reward, done = b[0], b[1], b[2], b[3]
                if done:
                    backup += prob * reward
                else:
                    backup += prob * (reward + alpha * snapshot[nxt])
            v[state] = backup
        # converged: the whole sweep moved v by less than eps
        if oometric(v, snapshot) < eps:
            break

def value_update_seidel(env, alpha, v, policy, iterations=1, eps=10**(-4)):
    """Apply the policy-evaluation backup to ``v`` in place (Gauss-Seidel style).

    Unlike ``value_update``, the backup reads the live ``v``, so values already
    refreshed earlier in the same sweep are used immediately. The snapshot is
    kept only for the convergence test. The original author's note flags
    reading ``v`` instead of the snapshot as an error for the finite problem.

    Parameters
    ----------
    env : environment exposing ``P`` and ``observation_space.n``.
    alpha : discount factor.
    v : dict state -> value, updated in place.
    policy : dict state -> action to evaluate.
    iterations : maximum number of sweeps.
    eps : stop early once a sweep changes ``v`` by less than ``eps`` in sup-norm.

    Returns
    -------
    None
    """
    for _ in range(iterations):
        before_sweep = v.copy()
        for state in states(env):
            backup = 0
            for b in env.P[state][policy[state]]:
                prob, nxt, reward, done = b[0], b[1], b[2], b[3]
                if done:
                    backup += prob * reward
                else:
                    # live v on purpose: this is the Gauss-Seidel variant
                    backup += prob * (reward + alpha * v[nxt])
            v[state] = backup
        # converged: the whole sweep moved v by less than eps
        if oometric(v, before_sweep) < eps:
            break


def policy_update(env, alpha, v, policy):
    """Make ``policy`` greedy with respect to ``v``, in place.

    For every state the one-step lookahead value of each available action is
    computed from ``env.P`` and the first action with the strictly largest
    value is stored in ``policy[state]``. If no action beats ``-inf`` the
    entry falls back to action ``0``.

    Parameters
    ----------
    env : environment exposing ``P`` and ``observation_space.n``.
    alpha : discount factor.
    v : dict state -> value (read only).
    policy : dict state -> action, overwritten in place.

    Returns
    -------
    None
    """
    for state in states(env):
        best_action = 0
        best_value = float("-inf")
        for action in actions(env, state):
            lookahead = 0
            for b in env.P[state][action]:
                prob, nxt, reward, done = b[0], b[1], b[2], b[3]
                if done:
                    lookahead += prob * reward
                else:
                    lookahead += prob * (reward + alpha * v[nxt])
            # strict comparison: earlier actions win ties
            if lookahead > best_value:
                best_action, best_value = action, lookahead
        policy[state] = best_action

def value_iteration(env,alpha,max_iter = 30,eps = 10**(-6)):
    """Run value iteration and return the full history for plotting.

    Starting from ``v = 0`` and the all-zero policy, each iteration makes the
    policy greedy w.r.t. the current ``v`` and then applies one evaluation
    sweep for that policy.

    Parameters
    ----------
    env : environment exposing ``P`` and ``observation_space.n``.
    alpha : discount factor.
    max_iter : maximum number of iterations.
    eps : after the first three iterations, stop once two consecutive value
        functions differ by less than ``eps`` in sup-norm.

    Returns
    -------
    (vv, pp) : list of value-function dicts and list of policy dicts, one
        entry per performed iteration.
    """
    v = dict.fromkeys(states(env), 0)
    policy = dict.fromkeys(states(env), 0)
    value_history = []   # kept for plotting
    policy_history = []  # kept for plotting

    for i in range(max_iter):
        policy_update(env, alpha, v, policy)
        value_update(env, alpha, v, policy, iterations=1)
        value_history.append(dict(v))
        policy_history.append(dict(policy))
        if i > 2:
            if oometric(value_history[-1], value_history[-2]) < eps:
                break
    return value_history, policy_history

def backward_induction(env,alpha,T):
    """Finite-horizon backward induction over ``T`` steps.

    Implemented as ``value_iteration`` with ``max_iter=T`` and ``eps=0``; since
    a sup-norm distance is never below zero, the early stop cannot trigger and
    exactly ``T`` iterations are performed.

    Returns
    -------
    (vv, pp) : value-function and policy histories, as from ``value_iteration``.
    """
    horizon = T
    return value_iteration(env, alpha, max_iter=horizon, eps=0)

def generalized_iteration(env,alpha,inner_iter=1,max_iter = 30,eps = 10**(-6)):
    """Generalized policy iteration; returns the full history for plotting.

    Starting from ``v = 0`` and the all-zero policy, each outer iteration runs
    up to ``inner_iter`` evaluation sweeps for the current policy (stopping
    early at tolerance ``eps``) and then makes the policy greedy w.r.t. the
    resulting ``v``.

    Parameters
    ----------
    env : environment exposing ``P`` and ``observation_space.n``.
    alpha : discount factor.
    inner_iter : maximum evaluation sweeps per outer iteration.
    max_iter : maximum number of outer iterations.
    eps : tolerance passed to the evaluation sweeps.

    Returns
    -------
    (vv, pp) : list of value-function dicts and list of policy dicts, one
        entry per performed outer iteration.
    """
    v = dict.fromkeys(states(env), 0)
    policy = dict.fromkeys(states(env), 0)
    value_history = []   # kept for plotting
    policy_history = []  # kept for plotting

    for i in range(max_iter):
        value_update(env, alpha, v, policy, inner_iter, eps)
        policy_update(env, alpha, v, policy)
        value_history.append(dict(v))
        policy_history.append(dict(policy))
        if i > 2:
            # stop once the policy has been unchanged over the last three iterations
            latest, previous, older = policy_history[-1], policy_history[-2], policy_history[-3]
            if latest == previous and previous == older:
                break
    return value_history, policy_history

def policy_iteration(env,alpha,max_iter = 30,eps = 10**(-6)):
    """Policy iteration, as generalized iteration with a long evaluation step.

    Each policy is evaluated with up to ``10**3`` sweeps (or until the sweeps
    converge to tolerance ``eps``) before the greedy improvement.

    Returns
    -------
    (vv, pp) : value-function and policy histories, as from
        ``generalized_iteration``.
    """
    evaluation_sweeps = 10**3
    return generalized_iteration(env, alpha, evaluation_sweeps, max_iter, eps)

In [86]:
# Pick an environment (alternatives are kept commented out) and solve it with
# one of the three solvers defined above; only one solver call is active.
env = time_level()
# env = envrandom()
# env=gym.make('CliffWalking-v0')
# env= gym.make('Taxi-v3') # use value iteration
# vv,pp = backward_induction(env,alpha=1,T=1000) #programming task 1
# vv,pp = policy_iteration(env,alpha=0.999,max_iter=50,eps=10**(-6)) #programming task 2
vv,pp = value_iteration(env,alpha=0.999,max_iter=300,eps = 0.001) #programming task 3
# The final policy as a list of (state, action) pairs.
sol = list(pp[-1].items()) # asked form of the policy in class
print(sol)
# NOTE(review): the message mentions a finite horizon, but the active solver
# above is value_iteration -- confirm the wording when switching solvers.
print(f"val(0) = {vv[-1][0]} at time 1 for finite horizon")
# intvp(vv,pp)

[(0, 4), (1, 4), (2, 4), (3, 4), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 5), (17, 0), (18, 0), (19, 0), (20, 3), (21, 3), (22, 3), (23, 3), (24, 0), (25, 0), (26, 0), (27, 0), (28, 0), (29, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0), (35, 0), (36, 3), (37, 0), (38, 0), (39, 0), (40, 0), (41, 0), (42, 0), (43, 0), (44, 2), (45, 2), (46, 2), (47, 2), (48, 0), (49, 0), (50, 0), (51, 0), (52, 0), (53, 0), (54, 0), (55, 0), (56, 0), (57, 2), (58, 0), (59, 0), (60, 0), (61, 0), (62, 0), (63, 0), (64, 2), (65, 2), (66, 2), (67, 2), (68, 0), (69, 0), (70, 0), (71, 0), (72, 0), (73, 0), (74, 0), (75, 0), (76, 0), (77, 2), (78, 0), (79, 0), (80, 0), (81, 0), (82, 0), (83, 0), (84, 4), (85, 4), (86, 4), (87, 4), (88, 0), (89, 0), (90, 0), (91, 0), (92, 0), (93, 0), (94, 0), (95, 0), (96, 0), (97, 5), (98, 0), (99, 0), (100, 1), (101, 1), (102, 1), (103, 1), (104, 0), (105, 0), (106, 0), (107, 0), (108, 0), (109, 0), (110, 0),

## policy evaluation
- value_eval evaluates a policy: basically it is a big  <br>
average of the rewards weighted by the probability of receiving them. <br>
A lot of terms can be reused, which is essentially what value_update does.  <br>
- MC_eval is just a Monte Carlo (MC) estimate of the total reward.

In [57]:
def finite_value_eval(env,alpha,pp):
    """Evaluate a sequence of policies with one exact backup per policy.

    Starting from ``v = 0``, the policies in ``pp`` are applied in list order,
    each with a single ``value_update`` sweep (``eps=0``, so no early stop).
    The value function after every sweep is recorded, and the value of state
    ``0`` after the last sweep is printed.

    Parameters
    ----------
    env : environment exposing ``P`` and ``observation_space.n``.
    alpha : discount factor.
    pp : non-empty sequence of policy dicts (state -> action).

    Returns
    -------
    list of value-function dicts, one per policy in ``pp``.
    """
    v = dict.fromkeys(states(env), 0)
    vv = []
    for step_policy in pp:
        value_update(env, alpha, v, step_policy, iterations=1, eps=0)
        vv.append(dict(v))
    print(f"finite val(0) = {vv[-1][0]}")
    return vv

def infinite_value_eval(env,alpha,policy,eps=10**(-6)):
    """Evaluate a stationary policy by iterating its backup to convergence.

    Starts from ``v = 0`` and runs ``value_update`` for at most ``10**5``
    sweeps, stopping early at sup-norm tolerance ``eps``. Prints the resulting
    value of state ``0``.

    Returns
    -------
    dict state -> value under ``policy``.
    """
    max_sweeps = 10**5
    v = dict.fromkeys(states(env), 0)
    value_update(env, alpha, v, policy, iterations=max_sweeps, eps=eps)
    print(f"infinite val(0) = {v[0]}")
    return v

# Monte Carlo estimate of the expected discounted total reward.
def MC_eval(env,pol,alpha,T=10**3,nsim = 10**3):
    """Simulate ``nsim`` episodes under ``pol`` and average the discounted return.

    Parameters
    ----------
    env : environment with ``reset()`` returning a sequence whose first item is
        the state, ``step(action)`` returning a 5-tuple whose first three items
        are (state, reward, done), and ``close()``.
    pol : time-indexed policy; ``pol[t][state]`` is the action at step ``t``.
    alpha : discount factor; the reward at step ``t`` is weighted by ``alpha**t``.
    T : maximum number of steps per episode.
    nsim : number of simulated episodes.

    Returns
    -------
    The average discounted return over the episodes (also printed).
    """
    running_sum = 0
    for _episode in range(nsim):
        # reset() yields (state, ...); older APIs (e.g. on google colab)
        # return the bare state instead
        state = env.reset()[0]
        total_reward = 0
        for t in range(T):
            chosen = pol[t][state]
            state, reward, done, _unused_flag, _unused_info = env.step(chosen)
            total_reward += alpha**t*reward
            if done:
                break
        env.close()
        # accumulate the mean incrementally: each episode contributes 1/nsim
        running_sum += total_reward/nsim
    print(f"infinite MC val(0) = {running_sum}")
    return running_sum

def random_policy(env,T):
    """Draw ``T`` independent random policies.

    Every state of every policy gets its own ``env.action_space.sample()``.

    Returns
    -------
    list of ``T`` dicts mapping state -> sampled action.
    """
    policies = []
    for _ in range(T):
        step_policy = {}
        for state in states(env):
            step_policy[state] = env.action_space.sample()
        policies.append(step_policy)
    return policies

def infinite_pol_rep(p,T):
    """Repeat a stationary policy ``T`` times as a time-indexed policy.

    The states are taken from ``p`` itself rather than from a notebook-global
    ``env``, so the result cannot silently depend on which environment
    happens to be bound to ``env`` when the cell is run.

    Parameters
    ----------
    p : mapping state -> action (a sequence indexed by state also works).
    T : number of time steps.

    Returns
    -------
    dict ``t -> {state: action}`` for ``t = 0 .. T-1``; every time step gets
    its own copy of the policy.
    """
    state_keys = list(p.keys()) if hasattr(p, "keys") else range(len(p))
    return {t: {s: p[s] for s in state_keys} for t in range(T)}

def finite_pol_rep(pp,T):
    """Turn a policy history into a time-indexed policy, in reverse order.

    Time step ``t`` uses ``pp[-t-1]``, i.e. the last policy in ``pp`` is played
    first. The states are taken from each policy itself rather than from a
    notebook-global ``env``, so the result cannot silently depend on which
    environment happens to be bound to ``env`` when the cell is run.

    Parameters
    ----------
    pp : sequence of policies (mapping state -> action, or a sequence indexed
        by state); must contain at least ``T`` entries, otherwise
        ``IndexError`` is raised.
    T : number of time steps.

    Returns
    -------
    dict ``t -> {state: action}`` for ``t = 0 .. T-1``.
    """
    def _as_dict(step_policy):
        # copy one policy, whether it is a mapping or a state-indexed sequence
        keys = step_policy.keys() if hasattr(step_policy, "keys") else range(len(step_policy))
        return {s: step_policy[s] for s in keys}

    return {t: _as_dict(pp[-t-1]) for t in range(T)}

In [72]:
# Cross-check for the infinite-horizon case: evaluate the final policy from
# policy_iteration both by iterating its backup (infinite_value_eval) and by
# Monte Carlo simulation over T steps (MC_eval), then print the value of
# state 0 that policy_iteration itself ended with.
env = time_level()
# env = envrandom()
alpha = 0.999
T = 10**3
pvv,ppp = policy_iteration(env,alpha=alpha,max_iter=50,eps=10**(-6)) 
infinite_value_eval(env,alpha,ppp[-1])
MC_eval(env,infinite_pol_rep(ppp[-1],T),alpha=alpha,T=T,nsim=10**3)
print(f"reference = {pvv[-1][0]}")

infinite val(0) = 0.5164902723354029
infinite MC val(0) = 0.5050410388335955
reference = 0.5164920595206823


In [71]:
# Cross-check for the finite-horizon case: evaluate the policy history from
# backward_induction exactly (finite_value_eval) and by Monte Carlo
# simulation, then print the value of state 0 from backward_induction.
# Note that MC_eval labels its printed estimate "infinite MC val(0)" here too.
env = time_level()
alpha=0.999
T = 30
bvv,bpp = backward_induction(env,alpha=alpha,T=T) 
finite_value_eval(env,alpha=alpha,pp=bpp)
# finite_pol_rep plays the policy history in reverse order (last entry first).
MC_eval(env,finite_pol_rep(bpp,T),alpha=alpha,T=T,nsim=10**3)
print(f"reference = {bvv[-1][0]}")

finite val(0) = 0.010602851206551666
infinite MC val(0) = 0.008758193928672796
reference = 0.010602851206551666


In [74]:
# Same finite-horizon cross-check, but for a randomly drawn time-dependent
# policy on the trivial() level: exact evaluation vs. Monte Carlo estimate.
env = trivial()
alpha=0.999
T = 200
rpp = random_policy(env,T)
finite_value_eval(env,alpha,rpp)
MC_eval(env,finite_pol_rep(rpp,T),alpha=alpha,T=T,nsim=10**3)
print()

finite val(0) = 0.35558098294602347
infinite MC val(0) = 0.39182276166057983



## $\varepsilon$-greedy Q-learning

We assume we know the states and the actions ...;
we start off from a value function because otherwise 
sparse rewards are a problem.

In [38]:
from random import randint
def get_action_from_q(q,state):
    """Return the greedy action for ``state`` and its Q-value.

    The search starts from a uniformly drawn action of ``q[state]`` and only
    switches on a strictly larger value. A drawn action that is already
    maximal is therefore kept; otherwise the first maximal action in
    iteration order wins.

    The starting action is drawn from the actions actually present in
    ``q[state]`` instead of assuming exactly four actions keyed ``0..3``.
    For that four-action case the random draw is unchanged.

    Parameters
    ----------
    q : dict state -> {action: value}.
    state : state to look up; ``q[state]`` must be non-empty.

    Returns
    -------
    (action, value) : the selected action and ``q[state][action]``.
    """
    q_state = q[state]
    candidates = list(q_state)
    start = candidates[randint(0, len(candidates) - 1)]
    max_a, max_val = start, q_state[start]
    for action, val in q_state.items():
        if val > max_val:
            max_a, max_val = action, val
    return max_a, max_val

def get_q_from_v(env,v,alpha):
    """Build a Q-table from a value function by one-step lookahead.

    Uses the transition model ``env.P``: ``q[state][action]`` is the expected
    reward plus the discounted value of the next state, where transitions
    flagged ``done`` contribute their reward only.

    Returns
    -------
    dict state -> {action: value} covering every state and available action.
    """
    q = {}
    for state in states(env):
        q_state = {}
        for action in actions(env, state):
            lookahead = 0
            for b in env.P[state][action]:  # uses P
                prob, nxt, reward, done = b[0], b[1], b[2], b[3]
                if done:
                    lookahead += prob * reward
                else:
                    lookahead += prob * (reward + alpha * v[nxt])
            q_state[action] = lookahead
        q[state] = q_state
    return q

def get_policy_from_q(env,q): # Programming task 5 (B)
    """Return the greedy policy (dict state -> action) induced by the Q-table ``q``."""
    greedy = {}
    for state in states(env):
        best_action, _best_value = get_action_from_q(q, state)
        greedy[state] = best_action
    return greedy

def Q_learning(env,v,alpha,gamma,T=10**3,nsim = 10**3):
    """Epsilon-greedy tabular Q-learning, warm-started from a value function.

    The Q-table is initialised from ``v`` via ``get_q_from_v`` (which uses the
    model ``env.P``) and then refined from simulated transitions with

        Q(s, a) <- (1 - gamma) * Q(s, a) + gamma * (r + alpha * max_a' Q(s', a'))

    Fixes relative to the previous version:
    * the update is written to the state the action was taken in; previously
      ``state`` was overwritten by ``env.step`` first, so ``q[next_state][action]``
      was updated instead;
    * on a terminal transition the target is the reward alone, matching the
      model-based backups in this notebook, which drop the continuation value
      when ``done`` is set;
    * exploration draws from the actions present in ``q[state]`` instead of
      assuming four actions keyed ``0..3``.

    Parameters
    ----------
    env : environment with ``reset()`` returning a sequence whose first item is
        the state, ``step(action)`` returning a 5-tuple whose first three items
        are (state, reward, done), and ``close()``.
    v : dict state -> value used to initialise the Q-table.
    alpha : discount factor.
    gamma : learning rate.
    T : maximum number of steps per episode.
    nsim : number of episodes.

    Returns
    -------
    dict state -> {action: value}, the learned Q-table.
    """
    q = get_q_from_v(env,v,alpha) # uses P
    for _episode in range(nsim):
        # reset() yields (state, ...); older APIs (e.g. on google colab)
        # return the bare state instead
        state = env.reset()[0]
        for t in range(T):
            # explore with probability 1/11 (randint(0,10) == 0), else act greedily
            if randint(0,10) != 0:
                action, _ = get_action_from_q(q, state)
            else:
                available = list(q[state])
                action = available[randint(0, len(available) - 1)]
            next_state, reward, done, _, _ = env.step(action)
            if done:
                # no continuation value after a terminal transition
                target = reward
            else:
                _, next_best = get_action_from_q(q, next_state)
                target = reward + alpha*next_best
            # update the entry of the state we acted in, not the one we landed in
            q[state][action] = (1-gamma)*(q[state][action]) + gamma*target
            state = next_state
            if done: break
        env.close()
    return q

In [62]:
# Q-learning demo: evaluate a random stationary policy, use its value function
# to warm-start Q-learning, extract the greedy policy from the learned
# Q-table, and compare the two policies' values state by state.
# env = time_level()
env = envrandom()
alpha = 1
# random_policy returns a list of policies; take the single one drawn here.
rp = random_policy(env,1)[0]
rv = infinite_value_eval(env,alpha,rp) #uses P
q= Q_learning(env,rv,alpha=alpha,gamma=0.2,T=2*10**2,nsim=10**3) # Programming task 5
qp = get_policy_from_q(env,q)

# Programming task 5 (C) and (D)
qv = infinite_value_eval(env,alpha,qp)
# Per-state difference between the Q-learning policy's value and the random policy's value.
dv = {state:(qv[state]-rv[state]) for state in states(env)}
# intvp([rv], [rp])
print("This interactive plot gives improvement of the value")
intvp([dv], [qp])
# visq(q)

infinite val(0) = 0.0
infinite val(0) = 0.2162460422496738
This interactive plot gives improvement of the value


interactive(children=(IntSlider(value=0, description='iterations', max=0), IntSlider(value=0, description='row…