In [1]:
import numpy as np

# Task 1: implement the grid world.

In [3]:
class GridWorld:
    """Deterministic 3x4 grid world with one obstacle and two terminal cells.

    States are ``(row, col)`` tuples. ``UP``/``DOWN`` decrease/increase the row
    index and ``LEFT``/``RIGHT`` decrease/increase the column index. A move that
    would leave the grid or enter an obstacle leaves the agent where it is.

    Entering ``(0, 3)`` yields a reward of +1 and entering ``(1, 3)`` yields -1;
    both cells end the episode. Every other transition yields 0.
    """

    # (row, col) offset applied by each action name.
    _MOVES = {"UP": (-1, 0), "DOWN": (1, 0), "RIGHT": (0, 1), "LEFT": (0, -1)}

    def __init__(self):
        self.grid_width = 4    # number of columns
        self.grid_height = 3   # number of rows
        self.start_state = (2, 0)
        self.goal_states = [(0, 3), (1, 3)]  # terminal cells
        self.obstacles = [(1, 1)]            # cells that can never be entered
        self.action_space = ('UP', 'DOWN', 'RIGHT', "LEFT")
        self.current_state = self.start_state

    def reset(self):
        """Put the agent back on the start cell and return that cell."""
        self.current_state = self.start_state
        return self.current_state

    def is_terminal(self):
        """Return True when the agent currently stands on a goal cell."""
        return self.current_state in self.goal_states

    def is_valid_state(self, state):
        """Return True if ``state`` is inside the grid and not an obstacle."""
        if state in self.obstacles:
            return False
        row, col = state
        return 0 <= row < self.grid_height and 0 <= col < self.grid_width

    def reward(self, state, action):
        """Return the immediate reward for taking ``action`` in ``state``.

        Only the three transitions that enter a terminal cell are non-zero:
        ``(0, 2)`` + RIGHT gives +1; ``(1, 2)`` + RIGHT and ``(2, 3)`` + UP
        give -1. The action name is matched case-insensitively so that this
        method agrees with ``next_state`` and ``step``.
        """
        action = str(action).upper()
        if state == (0, 2) and action == "RIGHT":
            return 1
        if (state, action) in (((1, 2), "RIGHT"), ((2, 3), "UP")):
            return -1
        return 0

    def next_state(self, state, action):
        """Return the cell reached from ``state`` by ``action`` without moving the agent.

        Args:
            state: ``(row, col)`` tuple to move from.
            action: one of UP / DOWN / RIGHT / LEFT (any letter case).

        Returns:
            The neighbouring cell, or ``state`` itself when the move is blocked
            by the grid boundary or an obstacle.

        Raises:
            ValueError: if ``action`` is not a known action name.
        """
        offset = self._MOVES.get(action.upper())
        if offset is None:
            raise ValueError("Invalid action")
        row, col = state
        candidate = (row + offset[0], col + offset[1])
        return candidate if self.is_valid_state(candidate) else state

    def step(self, action):
        """Apply ``action`` to the agent and report the outcome.

        Args:
            action: one of UP / DOWN / RIGHT / LEFT (any letter case).

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the agent's cell
            after the move, ``reward`` is +1 / -1 when the move enters
            ``(0, 3)`` / ``(1, 3)`` and 0 otherwise, ``done`` tells whether the
            agent is on a goal cell, and ``info`` is a dict with the keys
            "Previous state", "Action", "Next state", "Reward" and "Done".

        Raises:
            ValueError: if ``action`` is not a known action name; the agent is
                not moved in that case.
        """
        action = action.upper()
        previous_state = self.current_state
        # Single source of truth for the dynamics: reuse next_state().
        self.current_state = self.next_state(previous_state, action)

        done = self.is_terminal()
        # Every action has a non-zero offset, so an unchanged state means the
        # move was blocked; blocked moves never earn a reward.
        moved = self.current_state != previous_state
        reward = 0
        if moved and done:
            reward = 1 if self.current_state == (0, 3) else -1

        # "Next state" is the state actually reached (equal to "Previous state"
        # for a blocked move), not the out-of-grid / obstacle cell attempted.
        info = {
            "Previous state": previous_state,
            "Action": action,
            "Next state": self.current_state,
            "Reward": reward,
            "Done": done,
        }
        return self.current_state, reward, done, info
    

In [4]:
# Test Task 1:
# Roll out a fixed, hand-written policy starting from (2, 1) until the agent
# reaches a terminal cell, printing the info record returned by every step.
gw = GridWorld()
# One action per non-terminal, non-obstacle cell. The mixed-case names are fine
# because step() upper-cases the action before interpreting it.
policy = {(0,0):"Right", (0,1):"Right", (0,2):"Right",(1,0):"Up", (1,2):"Up", (2,0):"Up",(2,1):"Right",(2,2):"Up", (2,3): "Left"}
gw.reset()
gw.current_state = (2,1) # Change Start state
done = gw.is_terminal()
while not done:
    action = policy[gw.current_state]
    new_state, reward, done, info = gw.step(action)
    # Prints: previous state, action, next state, reward, done.
    print(info.values())

dict_values([(2, 1), 'RIGHT', (2, 2), 0, False])
dict_values([(2, 2), 'UP', (1, 2), 0, False])
dict_values([(1, 2), 'UP', (0, 2), 0, False])
dict_values([(0, 2), 'RIGHT', (0, 3), 1, True])


# Task 2: a) calculate the state-value function

In [20]:
def state_value_function(gamma=0.9):
    """Solve the linear Bellman system for the nine free cells of the 3x4 grid.

    Every equation weights each of the four actions by 0.25 (a uniform random
    policy). The obstacle (1, 1) and the terminal cells (0, 3) and (1, 3) are
    fixed at value 0 and are therefore not unknowns of the system.

    Args:
        gamma: discount factor used in every equation.

    Returns:
        A pair ``(grid, V_dic)``: ``grid`` is a 3x4 array holding the value of
        each cell (0 for the obstacle and terminal cells) and ``V_dic`` maps
        every ``(row, col)`` cell to its value.
    """
    # Unknowns of the system; the list position is the column in the matrix.
    free_cells = [(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2), (2, 3)]
    column_of = {cell: col for col, cell in enumerate(free_cells)}

    # Diagonal coefficient when two / one of the four actions leave the agent
    # in place, and the coefficient of each reachable neighbouring unknown.
    two_self_loops = 1 - 0.5 * gamma
    one_self_loop = 1 - 0.25 * gamma
    to_neighbour = -0.25 * gamma

    # One row per equation: (cell, diagonal coefficient, neighbouring unknowns,
    # right-hand side). A non-zero right-hand side is 0.25 times the reward of
    # the single action that enters a terminal cell.
    system = [
        ((2, 0), two_self_loops, [(2, 1), (1, 0)], 0.0),
        ((1, 0), two_self_loops, [(0, 0), (2, 0)], 0.0),
        ((0, 0), two_self_loops, [(0, 1), (1, 0)], 0.0),
        ((0, 1), two_self_loops, [(0, 2), (0, 0)], 0.0),
        ((0, 2), one_self_loop, [(1, 2), (0, 1)], 0.25),
        ((2, 1), two_self_loops, [(2, 2), (2, 0)], 0.0),
        ((2, 2), one_self_loop, [(1, 2), (2, 3), (2, 1)], 0.0),
        ((2, 3), two_self_loops, [(2, 2)], -0.25),
        ((1, 2), one_self_loop, [(0, 2), (2, 2)], -0.25),
    ]

    lhs = np.zeros((9, 9))
    rhs = np.zeros((9, 1))
    for row, (cell, diagonal, neighbours, constant) in enumerate(system):
        lhs[row, column_of[cell]] = diagonal
        for other in neighbours:
            lhs[row, column_of[other]] = to_neighbour
        rhs[row, 0] = constant

    solution = np.linalg.solve(lhs, rhs).ravel()

    V_dic = {cell: solution[col] for cell, col in column_of.items()}
    # Obstacle and terminal cells are defined to have value 0.
    for fixed_cell in ((1, 1), (0, 3), (1, 3)):
        V_dic[fixed_cell] = 0

    grid = np.zeros((3, 4))
    for cell, col in column_of.items():
        grid[cell] = solution[col]

    return grid, V_dic

In [21]:
# Show the 3x4 grid of state values for gamma = 0.9 (first element of the returned pair).
print(state_value_function(0.9)[0])

[[ 0.04919829  0.12715279  0.26161963  0.        ]
 [-0.00689031  0.         -0.3371296   0.        ]
 [-0.06604127 -0.15454389 -0.3117327  -0.58207247]]


In [22]:
# Show the cell -> value dictionary for gamma = 0.9 (second element of the returned pair).
print(state_value_function(0.9)[1])

{(0, 0): 0.04919828548334439, (0, 1): 0.12715278556446113, (0, 2): 0.2616196347853384, (1, 0): -0.006890309938508186, (1, 2): -0.337129599081629, (2, 0): -0.06604126533303105, (2, 1): -0.15454389420890102, (2, 2): -0.311732698288727, (2, 3): -0.5820724674817519, (1, 1): 0, (0, 3): 0, (1, 3): 0}


# Action Value Function

In [26]:
def action_value_function(env, gamma=0.9):
    """Compute one-step look-ahead action values for every cell of the 3x4 grid.

    For each cell and action the value is
    ``env.reward(cell, action) + gamma * V[env.next_state(cell, action)]``,
    rounded to two decimals, where ``V`` comes from ``state_value_function``.
    The obstacle (1, 1) and the terminal cells (0, 3) and (1, 3) get all zeros.

    Args:
        env: object providing ``reward(state, action)`` and
            ``next_state(state, action)``.
        gamma: discount factor, also forwarded to ``state_value_function``.

    Returns:
        Dict mapping each ``(row, col)`` cell to a list of four values in the
        order UP, DOWN, RIGHT, LEFT.
    """
    _, state_values = state_value_function(gamma)

    moves = ("UP", "DOWN", "RIGHT", "LEFT")  # order of the entries in each list
    zero_cells = ((1, 1), (0, 3), (1, 3))    # obstacle + terminal cells

    Q = {}
    for row in range(3):
        for col in range(4):
            cell = (row, col)
            if cell in zero_cells:
                Q[cell] = [0.0] * 4
            else:
                Q[cell] = [
                    round(env.reward(cell, move) + gamma * state_values[env.next_state(cell, move)], 2)
                    for move in moves
                ]
    return Q


In [31]:
gw = GridWorld()
# Header naming the order of the four entries in each per-state list.
print(" State : [ UP  | DOWN | RIGHT | LEFT ]")
# Bare last expression, so the notebook displays the returned Q dictionary.
# NOTE(review): the saved output below lists non-zero rows for (0, 3), (1, 1)
# and (1, 3), which the current action_value_function sets to zeros - the
# output looks stale; re-run this cell.
action_value_function(gw , 0.9 )

 State : [ UP  | DOWN | RIGHT | LEFT ]


{(0, 0): [0.04, -0.01, 0.11, 0.04],
 (0, 1): [0.11, 0.11, 0.24, 0.04],
 (0, 2): [0.24, -0.3, 1.0, 0.11],
 (0, 3): [0.0, 0.0, 0.0, 0.24],
 (1, 0): [0.04, -0.06, -0.01, -0.01],
 (1, 1): [0.11, -0.14, -0.3, -0.01],
 (1, 2): [0.24, -0.28, -1.0, -0.3],
 (1, 3): [0.0, -0.52, 0.0, -0.3],
 (2, 0): [-0.01, -0.06, -0.14, -0.06],
 (2, 1): [-0.14, -0.14, -0.28, -0.06],
 (2, 2): [-0.3, -0.28, -0.52, -0.14],
 (2, 3): [-1.0, -0.52, -0.52, -0.28]}