In [12]:
import numpy as np
import networkx.algorithms.shortest_paths.astar as astar

R = -0.04 # reward
g = 0.9 # discount factor
e = 0.00001 # convergence factor

def update_util(grd, g, R, e):
    i = 0 # discount index
    
    while True:
        d = 0
        gprev = grd.copy()
        for i in range(3):
            for j in range(4):
                # policy instructions
                nexti = 0
                nextj = 0
                error_posi = 0
                error_posj = 0

                if gprev[i,j][1] == 8 or gprev[i,j][1] == 0: continue
                elif gprev[i,j][1] == 1:
                    nexti = i
                    nextj = j+1
                    error_posi = 1
                elif gprev[i,j][1] == -1:
                    nexti = i
                    nextj = j-1
                    error_posi = 1
                elif gprev[i,j][1] == 2:
                    nexti = i-1
                    nextj = j
                    error_posj = 1
                
                # check obstacle
                uval_a, uval_b = 0.0, 0.0
                if (i+error_posi) > 2 or (j+error_posj) > 3 or ((i+error_posi) == 1 and (j+error_posj) == 1):
                    uval_a = gprev[i,j][0]
                else:
                    uval_a = gprev[i+error_posi,j+error_posj][0]
                
                if (i-error_posi) < 0 or (j-error_posj) < 0 or ((i-error_posi) == 1 and (j-error_posj) == 1):
                    uval_b = gprev[i,j][0]
                else:
                    uval_b = gprev[i-error_posi,j-error_posj][0]
               
                # find utility value
                grd[i,j][0] = (g**i) * R + (0.8 * gprev[nexti,nextj][0] + 0.1 * uval_a + 0.1 * uval_b)
                
                if abs(grd[i,j][0] - gprev[i,j][0]) > d:
                    d = abs(gprev[i,j][0] - grd[i,j][0])
                
        if d < e*(1 - g)/g: break
        else: i += 1
    
class Node:
    def __init__(self, i, j, c, path = 0, f = 2, parent = None, children = []):
        self.i = i
        self.j = j
        self.c = c
        self.f = f
        self.path = path
        self.parent = parent
        self.children = children

In [13]:
"""Set the grid: each cell is of type [utility,policy:up|down|left|right]"""
# left = -1, right = 1, up = 2, down = -2, obstacle = 8, target = 0
grid = np.array([
    [[0.0,1],[0.0,1],[0.0,1],[1.0,0]],
    [[0.0,2],[0.0,8],[0.0,2],[-1.0,0]],
    [[0.0,2],[0.0,-1],[0.0,-1],[0.0,-1]]
])

print("Calculating utilities...")
update_util(grid, g, R, e)
print("Finished!")
print(grid)

Calculating utilities...
Finished!
[[[ 0.81273116  1.        ]
  [ 0.86835616  1.        ]
  [ 0.91835616  1.        ]
  [ 1.          0.        ]]

 [[ 0.76773116  2.        ]
  [ 0.          8.        ]
  [ 0.66520548  2.        ]
  [-1.          0.        ]]

 [[ 0.7221686   2.        ]
  [ 0.68166847 -1.        ]
  [ 0.643839   -1.        ]
  [ 0.42518966 -1.        ]]]


In [14]:
# Calculate A*
visited = []
opened = []
start = Node(2, 0, R)
goal = Node(0, 3, 1)

current = start
visited.append(current)

while current.i != goal.i or current.j != goal.j:
    left = current.j - 1
    right = current.j + 1
    up = current.i - 1
    down = current.i + 1

    pos = []
    for gr in opened:
        pos.append((gr.i,gr.j))

    if left >= 0:
        new_node = None
        if grid[current.i,left][1] != 0:
            new_node = Node(current.i, left, R, )
        if (current.i,left) not in pos and (current.i,left) != (1,1):
            opened.append(Node())
    
    if(right < 4):
        if (current.i,right) not in opened and (current.i,right) != (1,1):
            opened.append((current.i,right))
    
    if(up >= 0):
        if (up,current.j) not in opened and (up,current.j) != (1,1):
            opened.append((up,current.j))

    if(down < 3):
        if (down,current.j) not in opened and (down,current.j) != (1,1):
            opened.append((down,current.j))
    
