Installing dependencies

In [None]:
! pip3 install -e .

# Q1 Dynamic Programming
- To find optimal policies for Markov Decision Processes (MDPs)
- Implement the Policy Iteration (PI) and Value Iteration (VI) algorithms (functions).
- For each algorithm, you can find the functions that you need to implement under Tasks below. 
- Read the code documentation to understand the input and required outputs of these functions. 
- We will mark your submission only based on the correctness of the outputs of these functions.

In [None]:
# from rl2023.exercise1.mdp_solver import ValueIteration, PolicyIteration
from rl2023.exercise1.mdp import MDP, Transition, State, Action
from rl2023 import constants
import numpy as np

In [None]:
constants.EX1_CONSTANTS

In [None]:
def calculate_state_values(policy, rewards, gamma=0.9, theta=0.01):
    """
  Args:
      policy (np.array): Policy giving the probability of taking each action from each state.
      rewards (np.array): Rewards corresponding to reaching the next state from each state (so r(s,s') here rather than the usual r(s,a)).
      gamma (float, optional): Discount factor. Defaults to 0.9.
      theta (float, optional): Minimum value change threshold to terminate iteration. Defaults to 0.01.

  Returns:
      values (np.array): State values for given policy and reward mappings.
  """    
    num_states, num_actions = policy.shape # 2, 3
    values = np.zeros((num_states+1)) # Include terminal state
    
    print(f'Beginning policy evaluation for given policy and MDP...\n')
    iteration = 1

    while True:
        print(f'Iteration: {iteration} \t Current Values: {values}')
        delta=0
        initial_values = values
        values = np.zeros_like(values)
        for state in range(num_states):
            for action in range(num_actions):
                # With probability 0.9, next state = action with corresponding reward. With probability 0.1, next state = state with no reward.
                next_state_probabilities = {action:0.9, state:0.1}
                # Often we leave this next state computation (transition dynamics) to the environment. But to use complete dynamic programming we must know 
                # the transition probablities.
                for next_state in next_state_probabilities.keys():
                  # Note the expectation here is over both the policy and next state probabilities (environment dynamics)
                    aa = policy[state][action]*next_state_probabilities[next_state] * (rewards[state][next_state] + gamma * initial_values[next_state])
                    values[state] += aa
#                     print(aa)

            delta = max(delta, abs(initial_values[state]-values[state]))

        if delta < theta:
            print(f'\nMax difference in state value from previous iteration = {delta} which is less than threshold {theta}. Policy Evaluation terminating...\n')
            break

        iteration+=1

    print(f'Final policy state values: {values}')
    return values

In [None]:
def calculate_greedy_policy(policy, rewards, values, valid_actions, gamma=0.9):
    """Improve policy (take greedy actions) with respect to rewards and state values.

  Args:
      policy (np.array): Policy giving the probability of taking each action from each state.
      rewards (np.array): Rewards corresponding to reaching the next state from each state (so r(s,s') here rather than the usual r(s,a)).
      values (np.array): State values evaluated for current policy.
      gamma (float, optional): Discount factor. Defaults to 0.9.

  Returns:
      policy (np.array): Improved (greedy) policy.
  """

    num_states, num_actions = policy.shape # 2, 3
    greedy_policy = np.zeros_like(policy)
    state_action_values = np.nan*policy
    for action in range(num_actions):
        for state in range(num_states):
            if valid_actions[state, action]==1:
                state_action_values[state][action] = 0.0
                # With probability 0.9, next state = action with corresponding reward. With probability 0.1, next state = state with no reward.
                next_state_probabilities = {action:0.9, state:0.1}
                # Often the environment handles this next state computation (transition dynamics). But to use complete dynamic programming we must know 
                # the transition probablities.
        for next_state in next_state_probabilities.keys():
          # Note the expectation is now only over next state probabilities (environment dynamics) since we want the value for each valid state and action.
            state_action_values[state][action] += next_state_probabilities[next_state]*(rewards[state][next_state]+gamma*values[next_state])
            
    
    greedy_action = np.nanargmax(state_action_values[state]) # Argmax ignoring invalid actions with nan value
  
    for action in range(num_actions):
        greedy_policy[state][action] = 1 if action == greedy_action else 0

    print(f'State action values with previous policy:\n {state_action_values}\n')
    print(f'Greedy policy after policy improvement:\n {greedy_policy}')
    return greedy_policy

In [None]:
# state values
valid_actions = np.array([[0,1,0],[1,0,1]])

policy = np.array([[0,1,0],[0.5,0,0.5]])

rewards = np.array([[0,0,0],[0,0,10]])
#         policy = np.array([[1,0,0],[0,1,0],[0.5,0,0.5]])

###############################################################

# algorithm
values = calculate_state_values(policy, rewards, gamma=0.9, theta=0.01)

policy = calculate_greedy_policy(policy, rewards, values, valid_actions, gamma=0.9)

# values = calculate_state_values(policy, rewards, gamma=0.9, theta=0.01)

# policy = calculate_greedy_policy(policy, rewards, values, valid_actions, gamma=0.9)

In [None]:
class ValueIteration(MDPSolver):
    """MDP solver using the Value Iteration algorithm
    """

    def _calc_value_func(self, theta: float) -> np.ndarray:
        """Calculates the value function

        **YOU MUST IMPLEMENT THIS FUNCTION FOR Q1**

        **DO NOT ALTER THE MDP HERE**

        Useful Variables:
        1. `self.mpd` -- Gives access to the MDP.
        2. `self.mdp.R` -- 3D NumPy array with the rewards for each transition.
            E.g. the reward of transition [3] -2-> [4] (going from state 3 to state 4 with action
            2) can be accessed with `self.R[3, 2, 4]`
        3. `self.mdp.P` -- 3D NumPy array with transition probabilities.
            *REMEMBER*: the sum of (STATE, ACTION, :) should be 1.0 (all actions lead somewhere)
            E.g. the transition probability of transition [3] -2-> [4] (going from state 3 to
            state 4 with action 2) can be accessed with `self.P[3, 2, 4]`

        :param theta (float): theta is the stop threshold for value iteration
        :return (np.ndarray of float with dim (num of states)):
            1D NumPy array with the values of each state.
            E.g. V[3] returns the computed value for state 3
        """
        V = np.zeros(self.state_dim)
        ### PUT YOUR CODE HERE ###
        # for each state, find the a which is maximum
        num_states, num_actions = self.state_dim, self.action_dim 
        rewards = self.mdp.R # access rewards 
        gamma = self.gamma
        delta = 0
        policies = np.zeros(num_states)
        while delta < theta:
            for s in range(num_states):
                v = V[s].copy()
                # initialise an array for each iteration (state), 
                # which consists of the possible values for each action 
                find_max = np.zeros(num_actions) 
                for a in range(num_actions): 
                    for ns in range(num_states):
                        next_state_probabilities = self.mdp.P[s, a, ns]
                        find_max[a] += next_state_probabilities*(rewards[s, a, ns] + gamma * V[ns])
                V[s] = max(find_max)
                policies[s] = np.nanargmax(find_max)
                
            delta = max(delta, abs(v - V[s]))
        
#         raise NotImplementedError("Needed for Q1")
        return V

    def _calc_policy(self, V: np.ndarray) -> np.ndarray:
        """Calculates the policy

        **YOU MUST IMPLEMENT THIS FUNCTION FOR Q1**

        :param V (np.ndarray of float with dim (num of states)):
            A 1D NumPy array that encodes the computed value function (from _calc_value_func(...))
            It is indexed as (State) where V[State] is the value of state 'State'
        :return (np.ndarray of float with dim (num of states, num of actions):
            A 2D NumPy array that encodes the calculated policy.
            It is indexed as (STATE, ACTION) where policy[STATE, ACTION] has the probability of
            taking action 'ACTION' in state 'STATE'.
            REMEMBER: the sum of policy[STATE, :] should always be 1.0
            For deterministic policies the following holds for each state S:
            policy[S, BEST_ACTION] = 1.0
            policy[S, OTHER_ACTIONS] = 0
        """
        policy = np.zeros([self.state_dim, self.action_dim])
        ### PUT YOUR CODE HERE ###
        num_states, num_actions = self.state_dim, self.action_dim 
        rewards = self.mdp.R # access rewards 
        gamma = self.gamma
# #         raise NotImplementedError("Needed for Q1")
        for s in range(num_states):
            find_max = np.zeros(num_actions) 
            for a in range(num_actions): 
                for ns in range(num_states):
                    next_state_probabilities = self.mdp.P[s, a, ns]
                    find_max[a] += next_state_probabilities*(rewards[s, a, ns] + gamma * V[ns])
            for i in range(len(find_max)):
                if find_max[i] == V[s]:
                    policy[s] = i
            
        return policy

    def solve(self, theta: float = 1e-6) -> Tuple[np.ndarray, np.ndarray]:
        """Solves the MDP

        Compiles the MDP and then calls the calc_value_func and
        calc_policy functions to return the best policy and the
        computed value function

        **DO NOT CHANGE THIS FUNCTION**

        :param theta (float, optional): stop threshold, defaults to 1e-6
        :return (Tuple[np.ndarray of float with dim (num of states, num of actions),
                       np.ndarray of float with dim (num of states)):
            Tuple of calculated policy and value function
        """
        
        self.mdp.ensure_compiled()
        V = self._calc_value_func(theta)
        policy = self._calc_policy(V)

        return policy, V


In [2]:
class PolicyIteration(MDPSolver):
    """MDP solver using the Policy Iteration algorithm
    """

    def _policy_eval(self, policy: np.ndarray) -> np.ndarray:
        """Computes one policy evaluation step

        **YOU MUST IMPLEMENT THIS FUNCTION FOR Q1**

        :param policy (np.ndarray of float with dim (num of states, num of actions)):
            A 2D NumPy array that encodes the policy.
            It is indexed as (STATE, ACTION) where policy[STATE, ACTION] has the probability of
            taking action 'ACTION' in state 'STATE'.
            REMEMBER: the sum of policy[STATE, :] should always be 1.0
            For deterministic policies the following holds for each state S:
            policy[S, BEST_ACTION] = 1.0
            policy[S, OTHER_ACTIONS] = 0
        :return (np.ndarray of float with dim (num of states)): 
            A 1D NumPy array that encodes the computed value function
            It is indexed as (State) where V[State] is the value of state 'State'
        """
        ### PUT YOUR CODE HERE ###
        # initilise V(s)
        theta = 0.01 # self.theta
        num_states, num_actions = self.state_dim, self.action_dim 
    
        V = np.zeros(num_states)
        gamma = 0.9 #self.gamma # obtain gamma value
        
        rewards = self.mdp.R # access rewards 

        
        print(f'Beginning policy evaluation for given policy and MDP...\n')
        iteration = 1

        while True:
            print(f'Iteration: {iteration} \t Current Values: {V}')
            delta = 0
            # set v = V
            initial_values = V.copy()
            for s in range(num_states):
                for a in range(num_actions):
                    for ns in range(num_states):
                        next_state_probabilities = self.mdp.P[s, a, ns] 
                        V[s] += policy[s][a]*next_state_probabilities*(rewards[s, a, ns] + gamma * initial_values[ns])

                delta = max(delta, abs(initial_values[s]-V[s]))
        
            if delta < theta:
                print(f'\nMax difference in state value from previous iteration = {delta} which is less than threshold {theta}. Policy Evaluation terminating...\n')
                break

            iteration+=1

            print(f'Final policy state values: {V}')
        
        
        return np.array(V)

    def _policy_improvement(self) -> Tuple[np.ndarray, np.ndarray]:
        """Computes policy iteration until a stable policy is reached

        **YOU MUST IMPLEMENT THIS FUNCTION FOR Q1**

        Useful Variables (As with Value Iteration):
        1. `self.mpd` -- Gives access to the MDP.
        2. `self.mdp.R` -- 3D NumPy array with the rewards for each transition.
            E.g. the reward of transition [3] -2-> [4] (going from state 3 to state 4 with action
            2) can be accessed with `self.R[3, 2, 4]`
        3. `self.mdp.P` -- 3D NumPy array with transition probabilities.
            *REMEMBER*: the sum of (STATE, ACTION, :) should be 1.0 (all actions lead somewhere)
            E.g. the transition probability of transition [3] -2-> [4] (going from state 3 to
            state 4 with action 2) can be accessed with `self.P[3, 2, 4]`

        :return (Tuple[np.ndarray of float with dim (num of states, num of actions),
                       np.ndarray of float with dim (num of states)):
            Tuple of calculated policy and value function
        """
        num_states, num_actions = self.state_dim, self.action_dim 
        policy = np.zeros([num_states, num_actions])
        policy = np.array([[1,0,0],[0,1,0],[0.5,0,0.5]])
        V = np.zeros([num_states])
        ### PUT YOUR CODE HERE ###
        gamma = 0.9 #self.gamma 
        rewards = self.mdp.R # access rewards 
        policy_stable = True
        optimal = False
        
        while optimal == False:

            # evaluate policy
            V = self._policy_eval(policy)

            for s in range(num_states):
                action = policy[s].copy()
                policy_max = np.zeros(num_actions)
                for a in range(num_actions): ### in action?
                    for ns in range(num_states):
                        next_state_probabilities = self.mdp.P[s, a, ns] 
                        policy_max[a] += next_state_probabilities*(rewards[s, a, ns] + gamma * V[ns])

                # for each state, take the action which leads to maximum policy[s]
                policy[s] = np.nanargmax(policy_max)

                if (action != policy[s]).any():
                    policy_stable = False
                    break

            if policy_stable:
                optimal = True
                break

        if optimal == True:
            return V, policy

    def solve(self, theta: float = 1e-6) -> Tuple[np.ndarray, np.ndarray]:
        """Solves the MDP

        This function compiles the MDP and then calls the
        policy improvement function that the student must implement
        and returns the solution

        **DO NOT CHANGE THIS FUNCTION**

        :param theta (float, optional): stop threshold, 
        defaults to 1e-6
        :return (Tuple[np.ndarray of float with dim (num of states, num of actions),
                       np.ndarray of float with dim (num of states)]):
            Tuple of calculated policy and value function
        """
        self.mdp.ensure_compiled()
        self.theta = theta
        return self._policy_improvement()

In [3]:
mdp = MDP()
mdp.add_transition(
    #         start action end prob reward
    Transition("rock0", "jump0", "rock0", 1, 0),
    Transition("rock0", "stay", "rock0", 1, 0),
    Transition("rock0", "jump1", "rock0", 0.1, 0),
    Transition("rock0", "jump1", "rock1", 0.9, 0),
    Transition("rock1", "jump0", "rock1", 0.1, 0),
    Transition("rock1", "jump0", "rock0", 0.9, 0),
    Transition("rock1", "jump1", "rock1", 0.1, 0),
    Transition("rock1", "jump1", "land", 0.9, 10),
    Transition("rock1", "stay", "rock1", 1, 0),
    Transition("land", "stay", "land", 1, 0),
    Transition("land", "jump0", "land", 1, 0),
    Transition("land", "jump1", "land", 1, 0),
)
# solver = ValueIteration(mdp, CONSTANTS["gamma"])
# policy, valuefunc = solver.solve()
# print("---Value Iteration---")
# print("Policy:")
# print(solver.decode_policy(policy))
# print("Value Function")
# print(valuefunc)

solver = PolicyIteration(mdp, CONSTANTS["gamma"])
policy, valuefunc = solver.solve()
print("---Policy Iteration---")
print("Policy:")
print(solver.decode_policy(policy))
print("Value Function")
print(valuefunc)

Beginning policy evaluation for given policy and MDP...

Iteration: 1 	 Current Values: [0. 0. 0.]
Final policy state values: [0. 9. 0.]
Iteration: 2 	 Current Values: [0. 9. 0.]
Final policy state values: [ 0.   18.81  0.  ]
Iteration: 3 	 Current Values: [ 0.   18.81  0.  ]
Final policy state values: [ 0.     29.5029  0.    ]
Iteration: 4 	 Current Values: [ 0.     29.5029  0.    ]
Final policy state values: [ 0.       41.158161  0.      ]
Iteration: 5 	 Current Values: [ 0.       41.158161  0.      ]
Final policy state values: [ 0.         53.86239549  0.        ]
Iteration: 6 	 Current Values: [ 0.         53.86239549  0.        ]
Final policy state values: [ 0.         67.71001108  0.        ]
Iteration: 7 	 Current Values: [ 0.         67.71001108  0.        ]
Final policy state values: [ 0.         82.80391208  0.        ]
Iteration: 8 	 Current Values: [ 0.         82.80391208  0.        ]
Final policy state values: [ 0.         99.25626417  0.        ]
Iteration: 9 	 Current V

Final policy state values: [0.00000000e+00 8.48276427e+12 0.00000000e+00]
Iteration: 293 	 Current Values: [0.00000000e+00 8.48276427e+12 0.00000000e+00]
Final policy state values: [0.00000000e+00 9.24621306e+12 0.00000000e+00]
Iteration: 294 	 Current Values: [0.00000000e+00 9.24621306e+12 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.00783722e+13 0.00000000e+00]
Iteration: 295 	 Current Values: [0.00000000e+00 1.00783722e+13 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.09854257e+13 0.00000000e+00]
Iteration: 296 	 Current Values: [0.00000000e+00 1.09854257e+13 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.19741141e+13 0.00000000e+00]
Iteration: 297 	 Current Values: [0.00000000e+00 1.19741141e+13 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.30517843e+13 0.00000000e+00]
Iteration: 298 	 Current Values: [0.00000000e+00 1.30517843e+13 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.42264449e+13 0.00000000e+00]
It

Iteration: 634 	 Current Values: [0.00000000e+00 4.90877579e+25 0.00000000e+00]
Final policy state values: [0.00000000e+00 5.35056562e+25 0.00000000e+00]
Iteration: 635 	 Current Values: [0.00000000e+00 5.35056562e+25 0.00000000e+00]
Final policy state values: [0.00000000e+00 5.83211652e+25 0.00000000e+00]
Iteration: 636 	 Current Values: [0.00000000e+00 5.83211652e+25 0.00000000e+00]
Final policy state values: [0.00000000e+00 6.35700701e+25 0.00000000e+00]
Iteration: 637 	 Current Values: [0.00000000e+00 6.35700701e+25 0.00000000e+00]
Final policy state values: [0.00000000e+00 6.92913764e+25 0.00000000e+00]
Iteration: 638 	 Current Values: [0.00000000e+00 6.92913764e+25 0.00000000e+00]
Final policy state values: [0.00000000e+00 7.55276003e+25 0.00000000e+00]
Iteration: 639 	 Current Values: [0.00000000e+00 7.55276003e+25 0.00000000e+00]
Final policy state values: [0.00000000e+00 8.23250843e+25 0.00000000e+00]
Iteration: 640 	 Current Values: [0.00000000e+00 8.23250843e+25 0.00000000e+

Iteration: 959 	 Current Values: [0.00000000e+00 7.15459455e+37 0.00000000e+00]
Final policy state values: [0.00000000e+00 7.79850805e+37 0.00000000e+00]
Iteration: 960 	 Current Values: [0.00000000e+00 7.79850805e+37 0.00000000e+00]
Final policy state values: [0.00000000e+00 8.50037378e+37 0.00000000e+00]
Iteration: 961 	 Current Values: [0.00000000e+00 8.50037378e+37 0.00000000e+00]
Final policy state values: [0.00000000e+00 9.26540742e+37 0.00000000e+00]
Iteration: 962 	 Current Values: [0.00000000e+00 9.26540742e+37 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.00992941e+38 0.00000000e+00]
Iteration: 963 	 Current Values: [0.00000000e+00 1.00992941e+38 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.10082306e+38 0.00000000e+00]
Iteration: 964 	 Current Values: [0.00000000e+00 1.10082306e+38 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.19989713e+38 0.00000000e+00]
Iteration: 965 	 Current Values: [0.00000000e+00 1.19989713e+38 0.00000000e+

Iteration: 1189 	 Current Values: [0.0000000e+00 2.9018807e+46 0.0000000e+00]
Final policy state values: [0.00000000e+00 3.16304996e+46 0.00000000e+00]
Iteration: 1190 	 Current Values: [0.00000000e+00 3.16304996e+46 0.00000000e+00]
Final policy state values: [0.00000000e+00 3.44772445e+46 0.00000000e+00]
Iteration: 1191 	 Current Values: [0.00000000e+00 3.44772445e+46 0.00000000e+00]
Final policy state values: [0.00000000e+00 3.75801966e+46 0.00000000e+00]
Iteration: 1192 	 Current Values: [0.00000000e+00 3.75801966e+46 0.00000000e+00]
Final policy state values: [0.00000000e+00 4.09624142e+46 0.00000000e+00]
Iteration: 1193 	 Current Values: [0.00000000e+00 4.09624142e+46 0.00000000e+00]
Final policy state values: [0.00000000e+00 4.46490315e+46 0.00000000e+00]
Iteration: 1194 	 Current Values: [0.00000000e+00 4.46490315e+46 0.00000000e+00]
Final policy state values: [0.00000000e+00 4.86674444e+46 0.00000000e+00]
Iteration: 1195 	 Current Values: [0.00000000e+00 4.86674444e+46 0.000000

Final policy state values: [0.00000000e+00 1.33228208e+53 0.00000000e+00]
Iteration: 1367 	 Current Values: [0.00000000e+00 1.33228208e+53 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.45218747e+53 0.00000000e+00]
Iteration: 1368 	 Current Values: [0.00000000e+00 1.45218747e+53 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.58288434e+53 0.00000000e+00]
Iteration: 1369 	 Current Values: [0.00000000e+00 1.58288434e+53 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.72534393e+53 0.00000000e+00]
Iteration: 1370 	 Current Values: [0.00000000e+00 1.72534393e+53 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.88062489e+53 0.00000000e+00]
Iteration: 1371 	 Current Values: [0.00000000e+00 1.88062489e+53 0.00000000e+00]
Final policy state values: [0.00000000e+00 2.04988113e+53 0.00000000e+00]
Iteration: 1372 	 Current Values: [0.00000000e+00 2.04988113e+53 0.00000000e+00]
Final policy state values: [0.00000000e+00 2.23437043e+53 0.00000000e+

Final policy state values: [0.0000000e+00 6.5774742e+62 0.0000000e+00]
Iteration: 1626 	 Current Values: [0.0000000e+00 6.5774742e+62 0.0000000e+00]
Final policy state values: [0.00000000e+00 7.16944688e+62 0.00000000e+00]
Iteration: 1627 	 Current Values: [0.00000000e+00 7.16944688e+62 0.00000000e+00]
Final policy state values: [0.0000000e+00 7.8146971e+62 0.0000000e+00]
Iteration: 1628 	 Current Values: [0.0000000e+00 7.8146971e+62 0.0000000e+00]
Final policy state values: [0.00000000e+00 8.51801984e+62 0.00000000e+00]
Iteration: 1629 	 Current Values: [0.00000000e+00 8.51801984e+62 0.00000000e+00]
Final policy state values: [0.00000000e+00 9.28464163e+62 0.00000000e+00]
Iteration: 1630 	 Current Values: [0.00000000e+00 9.28464163e+62 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.01202594e+63 0.00000000e+00]
Iteration: 1631 	 Current Values: [0.00000000e+00 1.01202594e+63 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.10310827e+63 0.00000000e+00]
Iteratio

Final policy state values: [0.00000000e+00 1.09995079e+70 0.00000000e+00]
Iteration: 1819 	 Current Values: [0.00000000e+00 1.09995079e+70 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.19894636e+70 0.00000000e+00]
Iteration: 1820 	 Current Values: [0.00000000e+00 1.19894636e+70 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.30685153e+70 0.00000000e+00]
Iteration: 1821 	 Current Values: [0.00000000e+00 1.30685153e+70 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.42446817e+70 0.00000000e+00]
Iteration: 1822 	 Current Values: [0.00000000e+00 1.42446817e+70 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.55267031e+70 0.00000000e+00]
Iteration: 1823 	 Current Values: [0.00000000e+00 1.55267031e+70 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.69241064e+70 0.00000000e+00]
Iteration: 1824 	 Current Values: [0.00000000e+00 1.69241064e+70 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.84472759e+70 0.00000000e+

Iteration: 2141 	 Current Values: [0.00000000e+00 1.23795709e+82 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.34937323e+82 0.00000000e+00]
Iteration: 2142 	 Current Values: [0.00000000e+00 1.34937323e+82 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.47081682e+82 0.00000000e+00]
Iteration: 2143 	 Current Values: [0.00000000e+00 1.47081682e+82 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.60319034e+82 0.00000000e+00]
Iteration: 2144 	 Current Values: [0.00000000e+00 1.60319034e+82 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.74747747e+82 0.00000000e+00]
Iteration: 2145 	 Current Values: [0.00000000e+00 1.74747747e+82 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.90475044e+82 0.00000000e+00]
Iteration: 2146 	 Current Values: [0.00000000e+00 1.90475044e+82 0.00000000e+00]
Final policy state values: [0.00000000e+00 2.07617798e+82 0.00000000e+00]
Iteration: 2147 	 Current Values: [0.00000000e+00 2.07617798e+82 0.000

Final policy state values: [0.00000000e+00 1.80433604e+94 0.00000000e+00]
Iteration: 2466 	 Current Values: [0.00000000e+00 1.80433604e+94 0.00000000e+00]
Final policy state values: [0.00000000e+00 1.96672628e+94 0.00000000e+00]
Iteration: 2467 	 Current Values: [0.00000000e+00 1.96672628e+94 0.00000000e+00]
Final policy state values: [0.00000000e+00 2.14373165e+94 0.00000000e+00]
Iteration: 2468 	 Current Values: [0.00000000e+00 2.14373165e+94 0.00000000e+00]
Final policy state values: [0.0000000e+00 2.3366675e+94 0.0000000e+00]
Iteration: 2469 	 Current Values: [0.0000000e+00 2.3366675e+94 0.0000000e+00]
Final policy state values: [0.00000000e+00 2.54696757e+94 0.00000000e+00]
Iteration: 2470 	 Current Values: [0.00000000e+00 2.54696757e+94 0.00000000e+00]
Final policy state values: [0.00000000e+00 2.77619465e+94 0.00000000e+00]
Iteration: 2471 	 Current Values: [0.00000000e+00 2.77619465e+94 0.00000000e+00]
Final policy state values: [0.00000000e+00 3.02605217e+94 0.00000000e+00]
It

Iteration: 2705 	 Current Values: [0.00000000e+000 1.58946303e+103 0.00000000e+000]
Final policy state values: [0.0000000e+000 1.7325147e+103 0.0000000e+000]
Iteration: 2706 	 Current Values: [0.0000000e+000 1.7325147e+103 0.0000000e+000]
Final policy state values: [0.00000000e+000 1.88844103e+103 0.00000000e+000]
Iteration: 2707 	 Current Values: [0.00000000e+000 1.88844103e+103 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.05840072e+103 0.00000000e+000]
Iteration: 2708 	 Current Values: [0.00000000e+000 2.05840072e+103 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.24365679e+103 0.00000000e+000]
Iteration: 2709 	 Current Values: [0.00000000e+000 2.24365679e+103 0.00000000e+000]
Final policy state values: [0.0000000e+000 2.4455859e+103 0.0000000e+000]
Iteration: 2710 	 Current Values: [0.0000000e+000 2.4455859e+103 0.0000000e+000]
Final policy state values: [0.00000000e+000 2.66568863e+103 0.00000000e+000]
Iteration: 2711 	 Current Values: [0.00000000e

Iteration: 2961 	 Current Values: [0.00000000e+000 6.05945974e+112 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.60481112e+112 0.00000000e+000]
Iteration: 2962 	 Current Values: [0.00000000e+000 6.60481112e+112 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.19924412e+112 0.00000000e+000]
Iteration: 2963 	 Current Values: [0.00000000e+000 7.19924412e+112 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.84717609e+112 0.00000000e+000]
Iteration: 2964 	 Current Values: [0.00000000e+000 7.84717609e+112 0.00000000e+000]
Final policy state values: [0.00000000e+000 8.55342193e+112 0.00000000e+000]
Iteration: 2965 	 Current Values: [0.00000000e+000 8.55342193e+112 0.00000000e+000]
Final policy state values: [0.00000000e+000 9.32322991e+112 0.00000000e+000]
Iteration: 2966 	 Current Values: [0.00000000e+000 9.32322991e+112 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.01623206e+113 0.00000000e+000]
Iteration: 2967 	 Current Values: 

Iteration: 3107 	 Current Values: [0.00000000e+000 1.76482889e+118 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.92366349e+118 0.00000000e+000]
Iteration: 3108 	 Current Values: [0.00000000e+000 1.92366349e+118 0.00000000e+000]
Final policy state values: [0.0000000e+000 2.0967932e+118 0.0000000e+000]
Iteration: 3109 	 Current Values: [0.0000000e+000 2.0967932e+118 0.0000000e+000]
Final policy state values: [0.00000000e+000 2.28550459e+118 0.00000000e+000]
Iteration: 3110 	 Current Values: [0.00000000e+000 2.28550459e+118 0.00000000e+000]
Final policy state values: [0.0000e+000 2.4912e+118 0.0000e+000]
Iteration: 3111 	 Current Values: [0.0000e+000 2.4912e+118 0.0000e+000]
Final policy state values: [0.000000e+000 2.715408e+118 0.000000e+000]
Iteration: 3112 	 Current Values: [0.000000e+000 2.715408e+118 0.000000e+000]
Final policy state values: [0.00000000e+000 2.95979472e+118 0.00000000e+000]
Iteration: 3113 	 Current Values: [0.00000000e+000 2.95979472e+118 0.0000000

Final policy state values: [0.00000000e+000 3.91572014e+126 0.00000000e+000]
Iteration: 3330 	 Current Values: [0.00000000e+000 3.91572014e+126 0.00000000e+000]
Final policy state values: [0.00000000e+000 4.26813495e+126 0.00000000e+000]
Iteration: 3331 	 Current Values: [0.00000000e+000 4.26813495e+126 0.00000000e+000]
Final policy state values: [0.00000000e+000 4.65226709e+126 0.00000000e+000]
Iteration: 3332 	 Current Values: [0.00000000e+000 4.65226709e+126 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.07097113e+126 0.00000000e+000]
Iteration: 3333 	 Current Values: [0.00000000e+000 5.07097113e+126 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.52735853e+126 0.00000000e+000]
Iteration: 3334 	 Current Values: [0.00000000e+000 5.52735853e+126 0.00000000e+000]
Final policy state values: [0.0000000e+000 6.0248208e+126 0.0000000e+000]
Iteration: 3335 	 Current Values: [0.0000000e+000 6.0248208e+126 0.0000000e+000]
Final policy state values: [0.00000000e+

Final policy state values: [0.00000000e+000 6.54825747e+133 0.00000000e+000]
Iteration: 3523 	 Current Values: [0.00000000e+000 6.54825747e+133 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.13760064e+133 0.00000000e+000]
Iteration: 3524 	 Current Values: [0.00000000e+000 7.13760064e+133 0.00000000e+000]
Final policy state values: [0.0000000e+000 7.7799847e+133 0.0000000e+000]
Iteration: 3525 	 Current Values: [0.0000000e+000 7.7799847e+133 0.0000000e+000]
Final policy state values: [0.00000000e+000 8.48018332e+133 0.00000000e+000]
Iteration: 3526 	 Current Values: [0.00000000e+000 8.48018332e+133 0.00000000e+000]
Final policy state values: [0.00000000e+000 9.24339982e+133 0.00000000e+000]
Iteration: 3527 	 Current Values: [0.00000000e+000 9.24339982e+133 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.00753058e+134 0.00000000e+000]
Iteration: 3528 	 Current Values: [0.00000000e+000 1.00753058e+134 0.00000000e+000]
Final policy state values: [0.00000000e+

Final policy state values: [0.00000000e+000 1.81183511e+144 0.00000000e+000]
Iteration: 3802 	 Current Values: [0.00000000e+000 1.81183511e+144 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.97490027e+144 0.00000000e+000]
Iteration: 3803 	 Current Values: [0.00000000e+000 1.97490027e+144 0.00000000e+000]
Final policy state values: [0.0000000e+000 2.1526413e+144 0.0000000e+000]
Iteration: 3804 	 Current Values: [0.0000000e+000 2.1526413e+144 0.0000000e+000]
Final policy state values: [0.00000000e+000 2.34637902e+144 0.00000000e+000]
Iteration: 3805 	 Current Values: [0.00000000e+000 2.34637902e+144 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.55755313e+144 0.00000000e+000]
Iteration: 3806 	 Current Values: [0.00000000e+000 2.55755313e+144 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.78773291e+144 0.00000000e+000]
Iteration: 3807 	 Current Values: [0.00000000e+000 2.78773291e+144 0.00000000e+000]
Final policy state values: [0.00000000e+

Iteration: 3986 	 Current Values: [0.00000000e+000 1.39506452e+151 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.52062033e+151 0.00000000e+000]
Iteration: 3987 	 Current Values: [0.00000000e+000 1.52062033e+151 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.65747616e+151 0.00000000e+000]
Iteration: 3988 	 Current Values: [0.00000000e+000 1.65747616e+151 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.80664901e+151 0.00000000e+000]
Iteration: 3989 	 Current Values: [0.00000000e+000 1.80664901e+151 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.96924742e+151 0.00000000e+000]
Iteration: 3990 	 Current Values: [0.00000000e+000 1.96924742e+151 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.14647969e+151 0.00000000e+000]
Iteration: 3991 	 Current Values: [0.00000000e+000 2.14647969e+151 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.33966286e+151 0.00000000e+000]
Iteration: 3992 	 Current Values: 

Iteration: 4219 	 Current Values: [0.00000000e+000 7.32771068e+159 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.98720464e+159 0.00000000e+000]
Iteration: 4220 	 Current Values: [0.00000000e+000 7.98720464e+159 0.00000000e+000]
Final policy state values: [0.00000000e+000 8.70605305e+159 0.00000000e+000]
Iteration: 4221 	 Current Values: [0.00000000e+000 8.70605305e+159 0.00000000e+000]
Final policy state values: [0.00000000e+000 9.48959783e+159 0.00000000e+000]
Iteration: 4222 	 Current Values: [0.00000000e+000 9.48959783e+159 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.03436616e+160 0.00000000e+000]
Iteration: 4223 	 Current Values: [0.00000000e+000 1.03436616e+160 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.12745912e+160 0.00000000e+000]
Iteration: 4224 	 Current Values: [0.00000000e+000 1.12745912e+160 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.22893044e+160 0.00000000e+000]
Iteration: 4225 	 Current Values: 

Final policy state values: [0.00000000e+000 4.46354467e+167 0.00000000e+000]
Iteration: 4427 	 Current Values: [0.00000000e+000 4.46354467e+167 0.00000000e+000]
Final policy state values: [0.00000000e+000 4.86526369e+167 0.00000000e+000]
Iteration: 4428 	 Current Values: [0.00000000e+000 4.86526369e+167 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.30313742e+167 0.00000000e+000]
Iteration: 4429 	 Current Values: [0.00000000e+000 5.30313742e+167 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.78041979e+167 0.00000000e+000]
Iteration: 4430 	 Current Values: [0.00000000e+000 5.78041979e+167 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.30065757e+167 0.00000000e+000]
Iteration: 4431 	 Current Values: [0.00000000e+000 6.30065757e+167 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.86771675e+167 0.00000000e+000]
Iteration: 4432 	 Current Values: [0.00000000e+000 6.86771675e+167 0.00000000e+000]
Final policy state values: [0.0000

Iteration: 4632 	 Current Values: [0.00000000e+000 2.09948107e+175 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.28843437e+175 0.00000000e+000]
Iteration: 4633 	 Current Values: [0.00000000e+000 2.28843437e+175 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.49439346e+175 0.00000000e+000]
Iteration: 4634 	 Current Values: [0.00000000e+000 2.49439346e+175 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.71888887e+175 0.00000000e+000]
Iteration: 4635 	 Current Values: [0.00000000e+000 2.71888887e+175 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.96358887e+175 0.00000000e+000]
Iteration: 4636 	 Current Values: [0.00000000e+000 2.96358887e+175 0.00000000e+000]
Final policy state values: [0.00000000e+000 3.23031187e+175 0.00000000e+000]
Iteration: 4637 	 Current Values: [0.00000000e+000 3.23031187e+175 0.00000000e+000]
Final policy state values: [0.00000000e+000 3.52103993e+175 0.00000000e+000]
Iteration: 4638 	 Current Values: 

Final policy state values: [0.00000000e+000 4.83745138e+181 0.00000000e+000]
Iteration: 4802 	 Current Values: [0.00000000e+000 4.83745138e+181 0.00000000e+000]
Final policy state values: [0.000000e+000 5.272822e+181 0.000000e+000]
Iteration: 4803 	 Current Values: [0.000000e+000 5.272822e+181 0.000000e+000]
Final policy state values: [0.00000000e+000 5.74737598e+181 0.00000000e+000]
Iteration: 4804 	 Current Values: [0.00000000e+000 5.74737598e+181 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.26463982e+181 0.00000000e+000]
Iteration: 4805 	 Current Values: [0.00000000e+000 6.26463982e+181 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.82845741e+181 0.00000000e+000]
Iteration: 4806 	 Current Values: [0.00000000e+000 6.82845741e+181 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.44301857e+181 0.00000000e+000]
Iteration: 4807 	 Current Values: [0.00000000e+000 7.44301857e+181 0.00000000e+000]
Final policy state values: [0.00000000e+000 8.

Iteration: 4985 	 Current Values: [0.00000000e+000 3.41716332e+188 0.00000000e+000]
Final policy state values: [0.00000000e+000 3.72470802e+188 0.00000000e+000]
Iteration: 4986 	 Current Values: [0.00000000e+000 3.72470802e+188 0.00000000e+000]
Final policy state values: [0.00000000e+000 4.05993174e+188 0.00000000e+000]
Iteration: 4987 	 Current Values: [0.00000000e+000 4.05993174e+188 0.00000000e+000]
Final policy state values: [0.0000000e+000 4.4253256e+188 0.0000000e+000]
Iteration: 4988 	 Current Values: [0.0000000e+000 4.4253256e+188 0.0000000e+000]
Final policy state values: [0.0000000e+000 4.8236049e+188 0.0000000e+000]
Iteration: 4989 	 Current Values: [0.0000000e+000 4.8236049e+188 0.0000000e+000]
Final policy state values: [0.00000000e+000 5.25772934e+188 0.00000000e+000]
Iteration: 4990 	 Current Values: [0.00000000e+000 5.25772934e+188 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.73092498e+188 0.00000000e+000]
Iteration: 4991 	 Current Values: [0.00000000e

Iteration: 5182 	 Current Values: [0.00000000e+000 8.06651286e+195 0.00000000e+000]
Final policy state values: [0.00000000e+000 8.79249901e+195 0.00000000e+000]
Iteration: 5183 	 Current Values: [0.00000000e+000 8.79249901e+195 0.00000000e+000]
Final policy state values: [0.00000000e+000 9.58382393e+195 0.00000000e+000]
Iteration: 5184 	 Current Values: [0.00000000e+000 9.58382393e+195 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.04463681e+196 0.00000000e+000]
Iteration: 5185 	 Current Values: [0.00000000e+000 1.04463681e+196 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.13865412e+196 0.00000000e+000]
Iteration: 5186 	 Current Values: [0.00000000e+000 1.13865412e+196 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.24113299e+196 0.00000000e+000]
Iteration: 5187 	 Current Values: [0.00000000e+000 1.24113299e+196 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.35283496e+196 0.00000000e+000]
Iteration: 5188 	 Current Values: 

Final policy state values: [0.00000000e+000 2.75376808e+204 0.00000000e+000]
Iteration: 5410 	 Current Values: [0.00000000e+000 2.75376808e+204 0.00000000e+000]
Final policy state values: [0.0000000e+000 3.0016072e+204 0.0000000e+000]
Iteration: 5411 	 Current Values: [0.0000000e+000 3.0016072e+204 0.0000000e+000]
Final policy state values: [0.00000000e+000 3.27175185e+204 0.00000000e+000]
Iteration: 5412 	 Current Values: [0.00000000e+000 3.27175185e+204 0.00000000e+000]
Final policy state values: [0.00000000e+000 3.56620952e+204 0.00000000e+000]
Iteration: 5413 	 Current Values: [0.00000000e+000 3.56620952e+204 0.00000000e+000]
Final policy state values: [0.00000000e+000 3.88716837e+204 0.00000000e+000]
Iteration: 5414 	 Current Values: [0.00000000e+000 3.88716837e+204 0.00000000e+000]
Final policy state values: [0.00000000e+000 4.23701353e+204 0.00000000e+000]
Iteration: 5415 	 Current Values: [0.00000000e+000 4.23701353e+204 0.00000000e+000]
Final policy state values: [0.00000000e+

Iteration: 5684 	 Current Values: [0.00000000e+000 4.95208165e+214 0.00000000e+000]
Final policy state values: [0.000000e+000 5.397769e+214 0.000000e+000]
Iteration: 5685 	 Current Values: [0.000000e+000 5.397769e+214 0.000000e+000]
Final policy state values: [0.00000000e+000 5.88356821e+214 0.00000000e+000]
Iteration: 5686 	 Current Values: [0.00000000e+000 5.88356821e+214 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.41308934e+214 0.00000000e+000]
Iteration: 5687 	 Current Values: [0.00000000e+000 6.41308934e+214 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.99026739e+214 0.00000000e+000]
Iteration: 5688 	 Current Values: [0.00000000e+000 6.99026739e+214 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.61939145e+214 0.00000000e+000]
Iteration: 5689 	 Current Values: [0.00000000e+000 7.61939145e+214 0.00000000e+000]
Final policy state values: [0.00000000e+000 8.30513668e+214 0.00000000e+000]
Iteration: 5690 	 Current Values: [0.00000000e

Final policy state values: [0.0000000e+000 4.0021587e+223 0.0000000e+000]
Iteration: 5922 	 Current Values: [0.0000000e+000 4.0021587e+223 0.0000000e+000]
Final policy state values: [0.00000000e+000 4.36235299e+223 0.00000000e+000]
Iteration: 5923 	 Current Values: [0.00000000e+000 4.36235299e+223 0.00000000e+000]
Final policy state values: [0.00000000e+000 4.75496476e+223 0.00000000e+000]
Iteration: 5924 	 Current Values: [0.00000000e+000 4.75496476e+223 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.18291158e+223 0.00000000e+000]
Iteration: 5925 	 Current Values: [0.00000000e+000 5.18291158e+223 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.64937363e+223 0.00000000e+000]
Iteration: 5926 	 Current Values: [0.00000000e+000 5.64937363e+223 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.15781725e+223 0.00000000e+000]
Iteration: 5927 	 Current Values: [0.00000000e+000 6.15781725e+223 0.00000000e+000]
Final policy state values: [0.00000000e+

Iteration: 6089 	 Current Values: [0.0000000e+000 7.1206471e+229 0.0000000e+000]
Final policy state values: [0.00000000e+000 7.76150534e+229 0.00000000e+000]
Iteration: 6090 	 Current Values: [0.00000000e+000 7.76150534e+229 0.00000000e+000]
Final policy state values: [0.00000000e+000 8.46004082e+229 0.00000000e+000]
Iteration: 6091 	 Current Values: [0.00000000e+000 8.46004082e+229 0.00000000e+000]
Final policy state values: [0.00000000e+000 9.22144449e+229 0.00000000e+000]
Iteration: 6092 	 Current Values: [0.00000000e+000 9.22144449e+229 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.00513745e+230 0.00000000e+000]
Iteration: 6093 	 Current Values: [0.00000000e+000 1.00513745e+230 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.09559982e+230 0.00000000e+000]
Iteration: 6094 	 Current Values: [0.00000000e+000 1.09559982e+230 0.00000000e+000]
Final policy state values: [0.0000000e+000 1.1942038e+230 0.0000000e+000]
Iteration: 6095 	 Current Values: [0.000

Final policy state values: [0.00000000e+000 1.38092826e+236 0.00000000e+000]
Iteration: 6257 	 Current Values: [0.00000000e+000 1.38092826e+236 0.00000000e+000]
Final policy state values: [0.0000000e+000 1.5052118e+236 0.0000000e+000]
Iteration: 6258 	 Current Values: [0.0000000e+000 1.5052118e+236 0.0000000e+000]
Final policy state values: [0.00000000e+000 1.64068086e+236 0.00000000e+000]
Iteration: 6259 	 Current Values: [0.00000000e+000 1.64068086e+236 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.78834214e+236 0.00000000e+000]
Iteration: 6260 	 Current Values: [0.00000000e+000 1.78834214e+236 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.94929293e+236 0.00000000e+000]
Iteration: 6261 	 Current Values: [0.00000000e+000 1.94929293e+236 0.00000000e+000]
Final policy state values: [0.0000000e+000 2.1247293e+236 0.0000000e+000]
Iteration: 6262 	 Current Values: [0.0000000e+000 2.1247293e+236 0.0000000e+000]
Final policy state values: [0.00000000e+000 2.

Final policy state values: [0.00000000e+000 1.06327778e+243 0.00000000e+000]
Iteration: 6441 	 Current Values: [0.00000000e+000 1.06327778e+243 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.15897278e+243 0.00000000e+000]
Iteration: 6442 	 Current Values: [0.00000000e+000 1.15897278e+243 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.26328033e+243 0.00000000e+000]
Iteration: 6443 	 Current Values: [0.00000000e+000 1.26328033e+243 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.37697556e+243 0.00000000e+000]
Iteration: 6444 	 Current Values: [0.00000000e+000 1.37697556e+243 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.50090336e+243 0.00000000e+000]
Iteration: 6445 	 Current Values: [0.00000000e+000 1.50090336e+243 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.63598466e+243 0.00000000e+000]
Iteration: 6446 	 Current Values: [0.00000000e+000 1.63598466e+243 0.00000000e+000]
Final policy state values: [0.0000

Final policy state values: [0.00000000e+000 1.21299564e+252 0.00000000e+000]
Iteration: 6683 	 Current Values: [0.00000000e+000 1.21299564e+252 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.32216525e+252 0.00000000e+000]
Iteration: 6684 	 Current Values: [0.00000000e+000 1.32216525e+252 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.44116012e+252 0.00000000e+000]
Iteration: 6685 	 Current Values: [0.00000000e+000 1.44116012e+252 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.57086453e+252 0.00000000e+000]
Iteration: 6686 	 Current Values: [0.00000000e+000 1.57086453e+252 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.71224234e+252 0.00000000e+000]
Iteration: 6687 	 Current Values: [0.00000000e+000 1.71224234e+252 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.86634415e+252 0.00000000e+000]
Iteration: 6688 	 Current Values: [0.00000000e+000 1.86634415e+252 0.00000000e+000]
Final policy state values: [0.0000

Final policy state values: [0.00000000e+000 4.34641968e+262 0.00000000e+000]
Iteration: 6965 	 Current Values: [0.00000000e+000 4.34641968e+262 0.00000000e+000]
Final policy state values: [0.00000000e+000 4.73759745e+262 0.00000000e+000]
Iteration: 6966 	 Current Values: [0.00000000e+000 4.73759745e+262 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.16398122e+262 0.00000000e+000]
Iteration: 6967 	 Current Values: [0.00000000e+000 5.16398122e+262 0.00000000e+000]
Final policy state values: [0.00000000e+000 5.62873953e+262 0.00000000e+000]
Iteration: 6968 	 Current Values: [0.00000000e+000 5.62873953e+262 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.13532609e+262 0.00000000e+000]
Iteration: 6969 	 Current Values: [0.00000000e+000 6.13532609e+262 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.68750544e+262 0.00000000e+000]
Iteration: 6970 	 Current Values: [0.00000000e+000 6.68750544e+262 0.00000000e+000]
Final policy state values: [0.0000

Final policy state values: [0.00000000e+000 2.48846867e+271 0.00000000e+000]
Iteration: 7199 	 Current Values: [0.00000000e+000 2.48846867e+271 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.71243085e+271 0.00000000e+000]
Iteration: 7200 	 Current Values: [0.00000000e+000 2.71243085e+271 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.95654962e+271 0.00000000e+000]
Iteration: 7201 	 Current Values: [0.00000000e+000 2.95654962e+271 0.00000000e+000]
Final policy state values: [0.00000000e+000 3.22263909e+271 0.00000000e+000]
Iteration: 7202 	 Current Values: [0.00000000e+000 3.22263909e+271 0.00000000e+000]
Final policy state values: [0.00000000e+000 3.51267661e+271 0.00000000e+000]
Iteration: 7203 	 Current Values: [0.00000000e+000 3.51267661e+271 0.00000000e+000]
Final policy state values: [0.0000000e+000 3.8288175e+271 0.0000000e+000]
Iteration: 7204 	 Current Values: [0.0000000e+000 3.8288175e+271 0.0000000e+000]
Final policy state values: [0.00000000e+

Iteration: 7452 	 Current Values: [0.00000000e+000 7.32548116e+280 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.98477446e+280 0.00000000e+000]
Iteration: 7453 	 Current Values: [0.00000000e+000 7.98477446e+280 0.00000000e+000]
Final policy state values: [0.00000000e+000 8.70340416e+280 0.00000000e+000]
Iteration: 7454 	 Current Values: [0.00000000e+000 8.70340416e+280 0.00000000e+000]
Final policy state values: [0.00000000e+000 9.48671054e+280 0.00000000e+000]
Iteration: 7455 	 Current Values: [0.00000000e+000 9.48671054e+280 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.03405145e+281 0.00000000e+000]
Iteration: 7456 	 Current Values: [0.00000000e+000 1.03405145e+281 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.12711608e+281 0.00000000e+000]
Iteration: 7457 	 Current Values: [0.00000000e+000 1.12711608e+281 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.22855653e+281 0.00000000e+000]
Iteration: 7458 	 Current Values: 

Iteration: 7717 	 Current Values: [0.00000000e+000 6.06538133e+290 0.00000000e+000]
Final policy state values: [0.00000000e+000 6.61126565e+290 0.00000000e+000]
Iteration: 7718 	 Current Values: [0.00000000e+000 6.61126565e+290 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.20627955e+290 0.00000000e+000]
Iteration: 7719 	 Current Values: [0.00000000e+000 7.20627955e+290 0.00000000e+000]
Final policy state values: [0.00000000e+000 7.85484471e+290 0.00000000e+000]
Iteration: 7720 	 Current Values: [0.00000000e+000 7.85484471e+290 0.00000000e+000]
Final policy state values: [0.00000000e+000 8.56178074e+290 0.00000000e+000]
Iteration: 7721 	 Current Values: [0.00000000e+000 8.56178074e+290 0.00000000e+000]
Final policy state values: [0.000000e+000 9.332341e+290 0.000000e+000]
Iteration: 7722 	 Current Values: [0.000000e+000 9.332341e+290 0.000000e+000]
Final policy state values: [0.00000000e+000 1.01722517e+291 0.00000000e+000]
Iteration: 7723 	 Current Values: [0.00000000e

Iteration: 7941 	 Current Values: [0.00000000e+000 1.46687683e+299 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.59889574e+299 0.00000000e+000]
Iteration: 7942 	 Current Values: [0.00000000e+000 1.59889574e+299 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.74279636e+299 0.00000000e+000]
Iteration: 7943 	 Current Values: [0.00000000e+000 1.74279636e+299 0.00000000e+000]
Final policy state values: [0.00000000e+000 1.89964803e+299 0.00000000e+000]
Iteration: 7944 	 Current Values: [0.00000000e+000 1.89964803e+299 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.07061636e+299 0.00000000e+000]
Iteration: 7945 	 Current Values: [0.00000000e+000 2.07061636e+299 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.25697183e+299 0.00000000e+000]
Iteration: 7946 	 Current Values: [0.00000000e+000 2.25697183e+299 0.00000000e+000]
Final policy state values: [0.00000000e+000 2.46009929e+299 0.00000000e+000]
Iteration: 7947 	 Current Values: 

  V[s] += policy[s][a]*next_state_probabilities*(rewards[s, a, ns] + gamma * initial_values[ns])
  V[s] += policy[s][a]*next_state_probabilities*(rewards[s, a, ns] + gamma * initial_values[ns])


ValueError: All-NaN slice encountered

In [1]:
from abc import ABC, abstractmethod
import numpy as np
from typing import List, Tuple, Dict, Optional, Hashable

from rl2023.constants import EX1_CONSTANTS as CONSTANTS
from rl2023.exercise1.mdp import MDP, Transition, State, Action

class MDPSolver(ABC):
    """Base class for MDP solvers

    **DO NOT CHANGE THIS CLASS**

    :attr mdp (MDP): MDP to solve
    :attr gamma (float): discount factor gamma to use
    :attr action_dim (int): number of actions in the MDP
    :attr state_dim (int): number of states in the MDP
    """

    def __init__(self, mdp: MDP, gamma: float):
        """Constructor of MDPSolver

        Initialises some variables from the MDP, namely the state and action dimension variables

        :param mdp (MDP): MDP to solve
        :param gamma (float): discount factor (gamma)
        """
        self.mdp: MDP = mdp
        self.gamma: float = gamma

        self.action_dim: int = len(self.mdp.actions)
        self.state_dim: int = len(self.mdp.states)

    def decode_policy(self, policy: Dict[int, np.ndarray]) -> Dict[State, Action]:
        """Generates greedy, deterministic policy dict

        Given a stochastic policy from state indeces to distribution over actions, the greedy,
        deterministic policy is generated choosing the action with highest probability

        :param policy (Dict[int, np.ndarray of float with dim (num of actions)]):
            stochastic policy assigning a distribution over actions to each state index
        :return (Dict[State, Action]): greedy, deterministic policy from states to actions
        """
        new_p = {}
        for state, state_idx in self.mdp._state_dict.items():
            new_p[state] = self.mdp.actions[np.argmax(policy[state_idx])]
        return new_p

    @abstractmethod
    def solve(self):
        """Solves the given MDP
        """
        ...