In [1]:
import numpy as np
import matplotlib.pyplot as plt 
from copy import deepcopy

In [2]:
# Labels for the three states and the three actions of the toy MDP.
states = ["s0", "s1", "s2"]
actions = ["a0", "a1", "a2"]

# Transition model, keyed by the current state index.
# P[s][a, sp] is the probability of moving to state sp when action a is taken
# in state s, so each row of every matrix sums to one.
P = {
    s: np.array(rows)
    for s, rows in enumerate([
        # transitions out of state 0
        [[0.55, 0.45, 0.0],
         [0.3, 0.7, 0.0],
         [1.0, 0.0, 0.0]],
        # transitions out of state 1
        [[1.0, 0.0, 0.0],
         [0.0, 0.4, 0.6],
         [0.0, 1.0, 0.0]],
        # transitions out of state 2
        [[0.0, 1.0, 0.0],
         [0.0, 0.6, 0.4],
         [0.0, 0.0, 1.0]],
    ])
}

# Reward table: R[s, a] is the reward for taking action a in state s.
R = np.array([
    [0.0, 0.0, 0.05],
    [0.0, 0.0, 0.0],
    [0.0, 1.0, 0.9],
])

###  Value Iteration 

In [3]:
# Value iteration: row i of V holds the value estimate of every state after
# sweep i; row 0 is the starting guess.
N = 10000
gamma = 0.95
V = np.zeros((N, 3))
V[0, :] = np.array([0.05, 0.0001, 0.08])
for i in range(1, N):
    previous = V[i - 1, :]
    for s in range(3):
        Q = []
        for a in range(3):
            # Bellman backup: immediate reward plus the discounted value of
            # each successor, weighted by its transition probability.
            backup = R[s, a]
            for sp, prob in enumerate(P[s][a]):
                backup = backup + prob * (gamma * previous[sp])
            Q.append(backup)
        V[i, s] = np.max(Q)
    # Stop as soon as two successive sweeps are closer than 0.01 (L2 norm).
    if np.linalg.norm(previous - V[i, :]) < 0.01:
        break
# Report the last sweep index reached and the value estimate at that sweep.
print(i, V[i, :])

100 [15.28540073 16.4426306  17.8942435 ]


In [4]:
# Greedy policy for the final value estimate V[i, :]: for every state, print
# the index of the action whose one-step lookahead value is largest.
for s in range(3):
    Q = []
    for a in range(3):
        lookahead = sum(
            P[s][a, sp] * (R[s, a] + gamma * V[i, sp]) for sp in range(3)
        )
        Q.append(lookahead)
    print(s, np.argmax(Q))

0 1
1 1
2 2


### Policy Iteration 

In [5]:
# Policy iteration with a single evaluation sweep per improvement step.
#
# Each iteration (1) backs up V once under the current policy and then
# (2) makes the policy greedy with respect to the refreshed V.
N = 10000      # hard cap on the number of iterations
tol = 1e-10    # the value estimate must move less than this before stopping
gamma = 0.95
V = np.array([0.05, 0.0001, 0.08])
policy = np.array([0, 0, 0])

for i in range(1, N):
    policy2 = policy.copy()   # policy at the start of this iteration
    oldV = V.copy()           # value estimate at the start of this iteration

    # Policy evaluation: one synchronous backup under the current policy.
    for s in range(3):
        a = policy2[s]
        V[s] = sum(P[s][a, sp] * (R[s, a] + gamma * oldV[sp]) for sp in range(3))

    # Policy improvement: every action, including the incumbent, is scored
    # against the same refreshed V, and the policy only changes when another
    # action is strictly better (ties keep the current action).
    for s in range(3):
        Q = [
            sum(P[s][a, sp] * (R[s, a] + gamma * V[sp]) for sp in range(3))
            for a in range(3)
        ]
        best = int(np.argmax(Q))
        if Q[best] > Q[policy[s]]:
            policy[s] = best

    # An unchanged policy alone is not sufficient to stop: with only one
    # evaluation sweep per iteration the policy can look stable while V is
    # still moving, so the values are required to have settled as well.
    if np.array_equal(policy, policy2) and np.max(np.abs(V - oldV)) < tol:
        break

# Report the iteration reached, the final policy and its value estimate.
print(i, policy, V)

### Q Learning 

In [6]:
from gridworld import GridWorld1
import gridrender as gui
import numpy as np
import time

# Bind the imported GridWorld1 object to `env`; it is used as-is (not called).
env = GridWorld1

In [7]:
# Print the environment's lookup tables one after another, then list the
# action names available in each state.
for attribute in ("state2coord", "coord2state", "state_actions"):
    print(getattr(env, attribute))
for i, allowed in enumerate(env.state_actions):
    print("s{}: {}".format(i, env.action_names[allowed]))

[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 3], [2, 0], [2, 1], [2, 2], [2, 3]]
[[ 0  1  2  3]
 [ 4 -1  5  6]
 [ 7  8  9 10]]
[[0, 1], [0, 2], [0, 1, 2], [0], [1, 3], [0, 1, 3], [0], [0, 3], [0, 2], [0, 2, 3], [2, 3]]
s0: ['right' 'down']
s1: ['right' 'left']
s2: ['right' 'down' 'left']
s3: ['right']
s4: ['down' 'up']
s5: ['right' 'down' 'up']
s6: ['right']
s7: ['right' 'up']
s8: ['right' 'left']
s9: ['right' 'left' 'up']
s10: ['left' 'up']


In [8]:
# Set the environment's `render` attribute to False
# (presumably this turns off the graphical display; confirm in gridworld).
env.render = False

In [10]:
# Random rollout: starting from state 0, take up to five random steps and
# print (next state, reward, terminal flag) after each one.
state = 0
for i in range(5):
    # Pick uniformly among the actions available in the current state.
    action = np.random.choice(env.state_actions[state])
    nexts, reward, term = env.step(state, action)
    state = nexts
    print(state, reward, term)
    if term:
        # The environment reported the end of the episode, so stop here
        # instead of continuing to step from a terminal state.
        break

0 0.0 False
4 0.0 False
0 0.0 False
1 0.0 False
0 0.0 False
