In [1]:
import numpy as np

In [2]:
class GridWorld:
    def __init__(self) :
        self.row = 5
        self.column = 5
        self.i = 4
        self.j = 0
        self.config()

    def config(self):
        reward = {}

        for p in range(self.row):
            for q in range(self.column):
                if p==0 and q==4:
                    reward[(p,q)] = 1
                elif p==1 and q==4:
                    reward[(p,q)] = -1
                elif p==2 and q==1:
                    reward[(p,q)] = 0
                elif p==3 and q==3:
                    reward[(p,q)] = 0
                else :
                    reward[(p,q)] = 0

        action = {(0, 0): {'D','R'},
        (0, 1):{'L','D','R'},
        (0, 2): {'L','D','R'},
        (0, 3): {'L','D','R'},
        (1, 0): {'U','R','D'},
        (1, 1): {'U','D','R','L'},
        (1, 2): {'U','D','R','L'},
        (1, 3): {'U','D','R','L'},
        (2, 0): {'U','R','D'},
        (2, 1): {'U','D','R','L'},
        (2, 2): {'U','D','R','L'},
        (2, 3): {'U','D','R','L'},
        (2, 4): {'D','U','L'},
        (3, 0): {'U','R','D'},
        (3, 1): {'U','D','R','L'},
        (3, 2): {'U','D','R','L'},
        (3, 3): {'U','D','R','L'},
        (3, 4): {'D','L','U'},
        (4, 0): {'U','R'},
        (4, 1): {'U','R','L'},
        (4, 2): {'U','R','L'},
        (4, 3): {'U','R','L'},
        (4, 4): {'U','L'},}

        self.set(reward,action)

    def set(self,reward, action):
        self.reward = reward
        self.action = action

    def get_nextState(self,si,sj,action):
        if action == "U":
            si -= 1
        elif action == "D":
            si += 1
        elif action == "R":
            sj += 1
        elif action == "D":
            sj -= 1
        return si,sj

    def get_reward(self,si,sj):
        return self.reward[(si,sj)]

    def move(self,action):
        self.i,self.j = self.get_nextState(self.i,self.j,action)
        return self.get_reward(self.i,self.j)
    
    def terminate(self):
        if (self.i,self.j) in self.action:
            return False
        return True
    
    def current_state(self):
        return self.i,self.j
    
    def is_terminal(self,i,j):
        if (i,j) in self.action:
            return False
        return True

In [3]:
g = GridWorld()

In [4]:
policy = {
        (0, 0): 'R',
        (0, 1):'R',
        (0, 2): 'R',
        (0, 3): 'R',
        (1, 0): 'U',
        (1, 1): 'U',
        (1, 2): 'U',
        (1, 3): 'U',
        (2, 0): 'U',
        (2, 2): 'U',
        (2, 3): 'U',
        (2, 4): 'L',
        (3, 0): 'U',
        (3, 1): 'R',
        (3, 2): 'U',
        (3, 4): 'U',
        (4, 0): 'U',
        (4, 1): 'U',
        (4, 2): 'U',
        (4, 3): 'L',
        (4, 4): 'L',
        (0, 4): 'D',
        (1,4) : 'U'}

In [5]:
#transition and reward function of S' / s2
transition = {}
reward_s2 = {}

for i in range(g.row):
    for j in range(g.column):
        if(i,j) in g.action:
            for action in g.action[(i,j)]:
                s2i,s2j = g.get_nextState(i,j,action)
                transition[(i,j,action,s2i,s2j)]=1
                reward_s2[(i,j,action,s2i,s2j)] = g.get_reward(s2i,s2j)



In [6]:
#print policy

print("POLICY")
for i in range(g.row):
    print ("---"*7)
    print("|",end=" ")
    for j in range(g.column):
        if (i,j) in policy:
            print(policy[(i,j)],"|",end=" ")
        else:
            print(" ","|",end=" ")
    print("")
print ("---"*7)



POLICY
---------------------
| R | R | R | R | D | 
---------------------
| U | U | U | U | U | 
---------------------
| U |   | U | U | L | 
---------------------
| U | R | U |   | U | 
---------------------
| U | U | U | L | L | 
---------------------


In [7]:
#print value
def plot_value(V):
    print("Value")
    for i in range(g.row):
        print ("---"*7)
        print("|",end=" ")
        for j in range(g.column):
            print("%.2f|" %V[(i,j)],end=" ")
        print("")
    print ("---"*7)

In [8]:
#initiate Value
V = {}

for p in range(g.row):
    for q in range(g.column):
            V[(p,q)] = 0

delta = 0.001
gamma = 0.9

In [9]:
# Run the model

temp = {}
iter = 0
while True:
    diff = 0
    for si in range(g.row):
        for sj in range(g.column):
            if not g.is_terminal(si,sj):
                temp = 0
                for action in g.action[(si,sj)]:
                    s2i,s2j = g.get_nextState(si,sj,action)
                    pi = 0
                    if (si,sj) in policy:
                        if policy[(si,sj)]==action:
                            pi = 1
                    temp += pi * transition[(si,sj,action,s2i,s2j)]*(reward_s2[(si,sj,action,s2i,s2j)]+ gamma * V[(s2i,s2j)])
                
                diff = max(diff,np.abs(V[(si,sj)]-temp))
                V[(si,sj)]=temp
    iter = iter+1
    print("Itereation :", iter, "      Diff :",diff)
    plot_value(V)
    if diff < delta:
        break
    

Itereation : 1       Diff : 1.0
Value
---------------------
| 0.00| 0.00| 0.00| 1.00| 0.00| 
---------------------
| 0.00| 0.00| 0.00| 0.90| 0.00| 
---------------------
| 0.00| 0.00| 0.00| 0.81| 0.00| 
---------------------
| 0.00| 0.00| 0.00| 0.00| 0.00| 
---------------------
| 0.00| 0.00| 0.00| 0.00| 0.00| 
---------------------
Itereation : 2       Diff : 0.9
Value
---------------------
| 0.00| 0.00| 0.90| 1.00| 0.00| 
---------------------
| 0.00| 0.00| 0.81| 0.90| 0.00| 
---------------------
| 0.00| 0.00| 0.73| 0.81| 0.00| 
---------------------
| 0.00| 0.00| 0.66| 0.00| 0.00| 
---------------------
| 0.00| 0.00| 0.59| 0.00| 0.00| 
---------------------
Itereation : 3       Diff : 0.81
Value
---------------------
| 0.00| 0.81| 0.90| 1.00| 0.00| 
---------------------
| 0.00| 0.73| 0.81| 0.90| 0.00| 
---------------------
| 0.00| 0.00| 0.73| 0.81| 0.00| 
---------------------
| 0.00| 0.59| 0.66| 0.00| 0.00| 
---------------------
| 0.00| 0.53| 0.59| 0.00| 0.00| 
----------------