# Homework 7

In [None]:
import abc
from collections import defaultdict
import numpy as np


class MDP(abc.ABC):
  """A Markov Decision Process.

  This is an abstract interface. It derives from `abc.ABC` so that the
  `abc.abstractmethod` markers below are actually enforced: a subclass
  can only be instantiated once it overrides every member declared here.
  """

  @property
  @abc.abstractmethod
  def state_space(self):
    """Representation of the MDP state set.
    """
    raise NotImplementedError("Override me")

  @property
  @abc.abstractmethod
  def action_space(self):
    """Representation of the MDP action set.
    """
    raise NotImplementedError("Override me")

  @abc.abstractmethod
  def get_transition_distribution(self, state, action):
    """Return a distribution over next states.

    The form of this distribution will vary, e.g., depending
    on whether the MDP has discrete or continuous states.

    Args:
      state: A current state.
      action: An action.

    Returns:
      next_state_distribution: Distribution over next states.
    """
    raise NotImplementedError("Override me")

  @abc.abstractmethod
  def get_reward(self, state, action, next_state):
    """Return the (deterministic) reward for executing action
    in state and arriving in next_state.

    Args:
      state: A current state.
      action: An action.
      next_state: A next state.

    Returns:
      reward: Single time step reward.
    """
    raise NotImplementedError("Override me")

  @abc.abstractmethod
  def state_is_terminal(self, state):
    """Designate certain states as terminal (done) states.

    Args:
      state: A state.

    Returns:
      is_terminal : A bool.
    """
    raise NotImplementedError("Override me")


# Example MDPs for tests
class Debug1DGridMDP(MDP):
  """A tiny 1x5 corridor MDP used for debugging.

  The agent is meant to start in the middle cell. Entering the
  rightmost cell yields +10, entering the leftmost cell yields -10,
  and every other step costs 1. The two actions are left (0) and
  right (1); with 10% probability the move goes the opposite way.
  """

  @property
  def state_space(self):
    # Cell index within the corridor.
    return [0, 1, 2, 3, 4]

  @property
  def action_space(self):
    # 0 moves left, 1 moves right.
    return [0, 1]

  def get_transition_distribution(self, state, action):
    """Return a dict mapping each possible next cell to its probability."""
    if action == 1:
      step = 1
    else:
      step = -1

    def clamp(cell):
      # Keep the agent inside the corridor.
      return min(max(cell, 0), 4)

    forward = clamp(state + step)
    backward = clamp(state - step)
    # The two outcomes must be distinct cells, otherwise the dict
    # below would silently drop one of the probabilities.
    assert (forward != backward)
    return {forward: 0.9, backward: 0.1}

  def get_reward(self, state, action, next_state):
    """Reward depends only on the cell that was entered."""
    if next_state == 0:
      return -10
    # Goal cell pays +10; anything else is the living penalty.
    return 10 if next_state == 4 else -1

  def state_is_terminal(self, state):
    """Both ends of the corridor are terminal."""
    return state in (0, 4)


class GridMDP(MDP):
  """A 2D grid MDP.

  States are (row, col) cells and actions are the four moves
  'up', 'down', 'left' and 'right'.

  Action effects are stochastic: with 0.75 probability, the action has
  the intended effect, otherwise one of the other actions is applied,
  each with equal probability. A move that would leave the grid or
  enter an obstacle cell leaves the agent where it is.

  The grid is determined by a 2D array of obstacles.

  Rewards depend on the next state and are
   - +1 for reaching the goal
   - -1 for reaching a trap
   - -1e-3 for each other step
  """

  @property
  def obstacles(self):
    # 1 marks an obstacle cell, 0 a free cell. A fresh array is built
    # on every access, so callers may mutate the result safely.
    return np.array([
      [0, 0, 1, 1, 0],
      [0, 0, 1, 0, 0],
      [1, 0, 0, 0, 1],
      [1, 1, 0, 0, 1],
      [0, 0, 0, 0, 0],
    ])

  @property
  def goal(self):
    return (4, 4)

  @property
  def traps(self):
    return [(0, 1), (4, 0)]

  @property
  def goal_reward(self):
    return 1.

  @property
  def trap_reward(self):
    return -1.

  @property
  def living_reward(self):
    return -1e-3

  @property
  def height(self):
    return self.obstacles.shape[0]

  @property
  def width(self):
    return self.obstacles.shape[1]

  @property
  def state_space(self):
    # Every cell of the grid, obstacle cells included.
    return [(r, c) for r in range(self.height) for c in range(self.width)]

  @property
  def action_space(self):
    return ['up', 'down', 'left', 'right']

  def action_to_delta(self, action):
    """Return the (row, col) offset applied by `action`.

    Raises:
      KeyError: If `action` is not one of the four known moves.
    """
    return {
      'up': (-1, 0),  # up,
      'down': (1, 0),  # down,
      'left': (0, -1),  # left,
      'right': (0, 1),  # right,
    }[action]

  def get_transition_distribution(self, state, action):
    """Return the next-state distribution for taking `action` in `state`.

    Args:
      state: A (row, col) tuple.
      action: One of the entries of `action_space`.

    Returns:
      next_state_distribution: A defaultdict(float) mapping each reachable
        (row, col) to its probability. Several moves can end in the same
        cell (e.g. when blocked), in which case their probabilities add up.
    """
    row, col = state

    # These properties rebuild their value on every access, so read them
    # once here instead of once per candidate action inside the loop.
    obstacles = self.obstacles
    height, width = self.height, self.width
    actions = self.action_space
    # Probability of each individual unintended move.
    slip_prob = 0.25 / (len(actions) - 1)

    next_state_distribution = defaultdict(float)

    for a in actions:
      dr, dc = self.action_to_delta(a)
      r, c = row + dr, col + dc
      # Stay in place if out of bounds or obstacle. The bounds check must
      # come first so that the array is never indexed out of range.
      in_bounds = 0 <= r < height and 0 <= c < width
      if not in_bounds or obstacles[r, c]:
        r, c = row, col
      next_state_distribution[(r, c)] += 0.75 if a == action else slip_prob

    return next_state_distribution

  def get_reward(self, state, action, next_state):
    """Return the reward, which depends only on `next_state`."""
    if next_state == self.goal:
      return self.goal_reward
    elif next_state in self.traps:
      return self.trap_reward
    return self.living_reward

  def state_is_terminal(self, state):
    """The goal and every trap are terminal states."""
    return state in [self.goal] + self.traps


## Problems

### Policy Evaluation
Complete the following implementation of iterative policy evaluation.

In [None]:
def evaluate_policy(pi, mdp, max_num_iterations=1000, change_threshold=0.0001, gamma=1.0):
  """Computes a value function for a policy in an MDP.

  Runs iterative policy evaluation with synchronous sweeps: every sweep
  computes all new state values from the values of the previous sweep,
  starting from an all-zero value function.

  Assumes that mdp has finite state and action spaces, and that
  transition distributions are dicts mapping next states to probabilities.

  Args:
    pi: A dict that maps a state to an action in the MDP. Entries are
      only required for non-terminal states.
    mdp : An MDP.
    max_num_iterations: An int representing the maximum number of
      sweeps to run before giving up.
    change_threshold: A float used to determine when iteration
      has converged and it is safe to terminate. Iteration stops after
      the first sweep whose largest absolute value change is below it.
    gamma: A float temporal discount factor between 0 and 1.

  Returns:
    value_function: A dict from states to values. Terminal states
      always have value 0.
  """
  states = list(mdp.state_space)
  value_function = {state: 0.0 for state in states}

  for _ in range(max_num_iterations):
    new_value_function = {}
    max_change = 0.0

    for state in states:
      # No reward can be collected after termination.
      if mdp.state_is_terminal(state):
        new_value_function[state] = 0.0
        continue

      action = pi[state]
      transitions = mdp.get_transition_distribution(state, action)
      # Bellman expectation backup using the previous sweep's values.
      new_value = sum(
        prob * (mdp.get_reward(state, action, next_state)
                + gamma * value_function[next_state])
        for next_state, prob in transitions.items()
      )
      new_value_function[state] = new_value
      max_change = max(max_change, abs(new_value - value_function[state]))

    value_function = new_value_function
    if max_change < change_threshold:
      break

  return value_function

Tests

In [None]:
# Policy that always moves right, towards the +10 end.
mdp = Debug1DGridMDP()
good_policy = dict.fromkeys(mdp.state_space, 1)
expected_V = {
  0: 0.0,
  1: 5.585314516769024,
  2: 8.317064005349863,
  3: 9.731701612974335,
  4: 0.0,
}
V = evaluate_policy(good_policy, mdp)
assert all(-1e-5 < expected_V[s] - V[s] < 1e-5 for s in mdp.state_space)

# Policy that always moves left, towards the -10 end.
mdp = Debug1DGridMDP()
bad_policy = dict.fromkeys(mdp.state_space, 0)
V = evaluate_policy(bad_policy, mdp)
expected_V = {
  0: 0.0,
  1: -10.219505953464065,
  2: -11.195109614239257,
  3: -9.975553581176575,
  4: 0.0,
}
assert all(-1e-5 < expected_V[s] - V[s] < 1e-5 for s in mdp.state_space)

# Policy that moves right from cell 2 onwards and left before it.
mdp = Debug1DGridMDP()
mixed_policy = {s: (0 if s < 2 else 1) for s in mdp.state_space}
V = evaluate_policy(mixed_policy, mdp)
expected_V = {
  0: 0.0,
  1: -8.422229,
  2: 6.7777,
  3: 9.577771,
  4: 0.0,
}
assert all(-1e-5 < expected_V[s] - V[s] < 1e-5 for s in mdp.state_space)

print('Tests passed.')

### Value Iteration
Complete the following implementation of value iteration.

In [None]:
def value_iteration(
    mdp,
    max_num_iterations=1000,
    change_threshold=0.0001,
    gamma=0.99,
    initial_Q=None,
    terminal_state_reward_fn=None,
    print_every=None,
    render_value_functions=False,
):
  """Compute a policy for an MDP with value iteration (not yet implemented).

  Value iteration is meant to run for at most max_num_iterations
  iterations, or until the max change between iterations is below
  change_threshold. The MDP must have discrete state and action spaces.
  The returned policy is defined over non-terminal states only.

  Args:
    mdp: An MDP.
    max_num_iterations: An int representing the maximum number of
        iterations to run value iteration before giving up.
    change_threshold: A float used to determine when value iteration
        has converged and it is safe to terminate.
    gamma: A float temporal discount factor between 0 and 1.
    initial_Q: Defaults to None. NOTE(review): not described by the
        original stub; presumably an initial Q-function -- confirm.
    terminal_state_reward_fn: Defaults to None. NOTE(review): not
        described by the original stub; semantics to be confirmed.
    print_every: Defaults to None. NOTE(review): not described by the
        original stub; presumably a progress-printing interval -- confirm.
    render_value_functions: Defaults to False. NOTE(review): not
        described by the original stub; semantics to be confirmed.

  Returns:
    pi: A dict; pi[state] -> action is the policy.

  Raises:
    NotImplementedError: Always, until the body is implemented.
  """
  raise NotImplementedError("Implement me!")

Tests

In [None]:
# In the 1D corridor the best policy always moves right (action 1).
assert value_iteration(Debug1DGridMDP()) == {1: 1, 2: 1, 3: 1}

# Expected policy for the 2D grid, one row of the grid per line.
# Terminal cells (goal and traps) carry no action.
# NOTE(review): (2, 3) is expected to move "left" although "down" enters
# the free cell (3, 3), which looks closer to the goal at (4, 4) --
# confirm this expected policy against a reference run.
assert value_iteration(GridMDP()) == {
  (0, 0): "down", (0, 2): "right", (0, 3): "down", (0, 4): "down",
  (1, 0): "right", (1, 1): "down", (1, 2): "down", (1, 3): "down", (1, 4): "left",
  (2, 0): "right", (2, 1): "right", (2, 2): "down", (2, 3): "left", (2, 4): "left",
  (3, 0): "right", (3, 1): "right", (3, 2): "down", (3, 3): "down", (3, 4): "down",
  (4, 1): "right", (4, 2): "right", (4, 3): "right",
}
print('Tests passed.')