# Homework 8

## Imports and Utilities
**Note**: these imports and functions are available in catsoop. You do not need to copy them in.

In [None]:
from collections import defaultdict
import abc
import numpy as np
import sys
import pdb

def press(event):
  """Matplotlib key-press callback.

  If the user hits 'q' the whole program exits via sys.exit(0); for any other
  key plt.close() is called.

  Args:
    event: the key event handed over by matplotlib; only event.key is read.
  """
  from matplotlib import pyplot as plt
  wants_quit = (event.key == 'q')
  if wants_quit:
    sys.exit(0)
  plt.close()


def plot_value_function(V, mdp):
  """Draw a grid value function together with its greedy policy.

  Every cell is shaded by its value (binary colormap, normalised to the
  min/max of V), obstacle cells are painted red, hazard cells blue, and an
  arrow pointing along mdp.get_action(V, state) is drawn on each state of V.
  The call blocks in plt.show(); key presses are routed to press().

  Args:
    V: a dict of {(r, c) : value}
    mdp: a grid MDP providing height, width, obstacles, hazards,
      get_action(V, state) and action_to_delta(action).
  """

  import matplotlib.pyplot as plt

  # Scatter the values into a dense (height, width) array.
  grid = np.zeros((mdp.height, mdp.width))
  for (row, col), value in V.items():
    grid[row, col] = value

  values = V.values()
  scale = plt.Normalize(min(values), max(values))
  rgba = plt.cm.binary(scale(grid))

  # Obstacles are red.
  for r, c in np.ndindex(mdp.height, mdp.width):
    if mdp.obstacles[r][c]:
      rgba[r, c, :3] = 1, 0, 0

  # Hazards (if any) are blue.
  hazards = mdp.hazards
  if hazards:
    for hazard in hazards:
      rgba[hazard[0], hazard[1], :3] = 0, 0, 1

  fig, ax = plt.subplots()
  fig.canvas.mpl_connect('key_press_event', press)
  ax.imshow(rgba)

  #
  # Image coordinates are (row, column), whereas plt.arrow expects (x, y).
  # With rows acting as y-coordinates the two calls use different
  # conventions, so the components are swapped when drawing the arrows. On
  # top of that, image coordinates have their origin in the top-left corner
  # and are left-handed.
  #

  for state in V:
    row, col = state
    d_row, d_col = mdp.action_to_delta(mdp.get_action(V, state))
    plt.arrow(col, row, d_col*.35, d_row*.35, width = 0.05)

  plt.show()


def check_value_function(sub, sol):
  """Assert that a submitted value function matches the reference one.

  Args:
    sub: the submitted value function, a dict of state -> value.
    sol: the reference value function, a dict of state -> value.

  Returns:
    True when both dicts have exactly the same keys and every value agrees
    to within 1e-5 (absolute difference).

  Raises:
    AssertionError: if the key sets differ, or at the first value that is
      off by 1e-5 or more.
  """
  same_keys = set(sub) == set(sol)
  assert same_keys, 'Sets of entries do not match.'
  for key in sol:
    expected, got = sol[key], sub[key]
    assert abs(got - expected) < 1e-5, f'Value do not match for {key}: expect {expected}, got {got}'
  return True


class MDP:
    """Base interface for a Markov Decision Process.

    Subclasses supply the state/action sets, the reward function and the
    transition distribution; discount factor, horizon and terminal-state
    handling have defaults that can be overridden.

    Note: the class does not derive from abc.ABC, so the abstractmethod
    markers below are not enforced at instantiation time; a missing override
    only shows up as NotImplementedError when the member is used.
    """

    @property
    @abc.abstractmethod
    def state_space(self):
        """The set of states of the MDP.

        Unless otherwise stated, assume this is a set.
        """
        raise NotImplementedError("Override me")

    @property
    @abc.abstractmethod
    def action_space(self):
        """The set of actions of the MDP.

        Unless otherwise stated, assume this is a set.
        """
        raise NotImplementedError("Override me")

    @property
    def temporal_discount_factor(self):
        """Discount factor gamma; 1 unless overridden."""
        return 1.

    @property
    def horizon(self):
        """Horizon H; infinite unless overridden."""
        return float("inf")

    def state_is_terminal(self, state):
        """Tell whether `state` is a terminal (done) state.

        The base implementation treats no state as terminal.

        Args:
            state: A state.

        Returns:
            is_terminal : A bool.
        """
        return False

    @abc.abstractmethod
    def get_reward(self, state, action, next_state):
        """Deterministic reward for the transition state -> next_state
        under action.

        Args:
            state: A current state.
            action: An action.
            next_state: A next state.

        Returns:
            reward : Single time step reward.
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_transition_distribution(self, state, action):
        """Distribution over next states for taking action in state.

        Unless otherwise stated, assume the result is a dictionary that
        maps next states to probabilities. With the state space
        {0, 1, 2}, a possible return value is {0: 0.3, 1: 0.2, 2: 0.5}.

        Args:
            state: A current state.
            action: An action.

        Returns:
            next_state_distribution: Distribution over next states.
        """
        raise NotImplementedError("Override me")

    def sample_next_state(self, state, action, rng=np.random):
        """Draw one next state according to the transition distribution.

        Subclasses may override this when the explicit distribution is too
        large to enumerate.

        Args:
            state: A state from the state space.
            action: An action from the action space.
            rng: A random number generator offering choice(n, p=...).

        Returns:
            next_state: A sampled next state from the state space.
        """
        distribution = self.get_transition_distribution(state, action)
        candidates, weights = zip(*distribution.items())
        # Sample an index rather than a state so that states of any type
        # (e.g. tuples) are returned untouched.
        picked = rng.choice(len(candidates), p=weights)
        return candidates[picked]


class SingleRowMDP(MDP):
    """A 1x5 corridor MDP meant for debugging.

    The agent is meant to start in the middle cell. Entering the rightmost
    cell yields +10, entering the leftmost cell -10, any other step -1.
    The two actions are left (0) and right (1); with probability 0.1 the
    effect of an action is reversed. Both end cells are terminal.
    """
    @property
    def state_space(self):
        # Positions along the row.
        return {0, 1, 2, 3, 4}

    @property
    def action_space(self):
        # 0 = left, 1 = right
        return {0, 1}

    def get_transition_distribution(self, state, action):
        """Return {next_state: prob}: 0.9 for the intended move, 0.1 for
        the opposite one, both clamped to the row."""
        step = 1 if action == 1 else -1

        def clamp(position):
            return min(max(position, 0), 4)

        intended = clamp(state + step)
        reversed_move = clamp(state - step)
        assert intended != reversed_move
        return {intended: 0.9, reversed_move: 0.1}

    def get_reward(self, state, action, next_state):
        """Reward depends only on the cell that was entered."""
        # -1 is the living penalty for every non-terminal cell.
        return {0: -10, 4: 10}.get(next_state, -1)

    def state_is_terminal(self, state):
        return state in (0, 4)


class ZitsMDP(MDP):
    """The Zits MDP described in lecture.

    States 0..4, actions "apply" and "sleep", discount factor 0.9.
    """

    @property
    def state_space(self):
        return {0, 1, 2, 3, 4}

    @property
    def action_space(self):
        return {"apply", "sleep"}

    @property
    def temporal_discount_factor(self):
        return 0.9

    def get_reward(self, state, action, next_state):
        """Reward is minus the next state, with an extra -1 for "apply"."""
        if action != "apply":
            assert action == "sleep"
            return -next_state
        return -1 - next_state

    def get_transition_distribution(self, state, action):
        """"apply" jumps to 0 (0.8) or 4 (0.2); "sleep" moves one state up
        (0.4) or down (0.6), clamped to [0, 4]."""
        if action != "apply":
            assert action == "sleep"
            one_up = min(state + 1, 4)
            one_down = max(state - 1, 0)
            return {one_up: 0.4, one_down: 0.6}
        return {0: 0.8, 4: 0.2}


class ChaseMDP(MDP):
    """Grid MDP in which an agent chases a randomly moving bunny.

    A state is a pair (agent_pos, bunny_pos) of (row, col) cells. The agent
    moves deterministically; the bunny stays put with probability 0.5 and
    otherwise attempts one of the four moves uniformly at random. A state is
    terminal once both share a cell.
    """

    @property
    def obstacles(self):
        # Default layout: a 2x3 grid without obstacles.
        return np.zeros((2, 3))

    @property
    def goal_reward(self):
        return 1

    @property
    def living_reward(self):
        return 0

    @property
    def height(self):
        return self.obstacles.shape[0]

    @property
    def width(self):
        return self.obstacles.shape[1]

    @property
    def state_space(self):
        rows, cols = range(self.height), range(self.width)
        cells = [(r, c) for r in rows for c in cols]
        return {(agent, bunny) for agent in cells for bunny in cells}

    @property
    def action_space(self):
        return {'up', 'down', 'left', 'right'}

    @property
    def temporal_discount_factor(self):
        return 0.9

    def action_to_delta(self, action):
        """Map an action name to its (d_row, d_col) offset."""
        deltas = {
            'up': (-1, 0),
            'down': (1, 0),
            'left': (0, -1),
            'right': (0, 1),
        }
        return deltas[action]

    def _attempt_move(self, position, delta):
        """Return the cell reached from position via delta, or the original
        cell when the target is out of bounds or an obstacle."""
        row, col = position
        d_row, d_col = delta
        r, c = row + d_row, col + d_col
        inside = 0 <= r < self.height and 0 <= c < self.width
        if inside and not self.obstacles[r, c]:
            return (r, c)
        return (row, col)

    def get_transition_distribution(self, state, action):
        """Return a dict mapping next (agent_pos, bunny_pos) states to
        probabilities."""
        dist = defaultdict(float)

        agent_pos, goal_pos = state

        # The agent's move is deterministic.
        next_agent_pos = self._attempt_move(
            agent_pos, self.action_to_delta(action))

        # The bunny stays where it is with probability 0.5 ...
        dist[(next_agent_pos, goal_pos)] += 0.5
        # ... and otherwise tries each of the four moves with equal
        # probability; blocked moves fold back onto its current cell.
        for bunny_delta in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
            next_goal_pos = self._attempt_move(goal_pos, bunny_delta)
            dist[(next_agent_pos, next_goal_pos)] += 0.5*0.25

        return dist

    def get_reward(self, state, action, next_state):
        """goal_reward when the agent lands on the bunny, else living_reward."""
        agent_pos, goal_pos = next_state
        caught = agent_pos == goal_pos
        return self.goal_reward if caught else self.living_reward

    def state_is_terminal(self, state):
        agent_pos, goal_pos = state
        return agent_pos == goal_pos


class RescueMDP(MDP):
    """A 2D grid Rescue MDP. We know where the person to be rescued is, and we
    have to go get them.

    States are the (row, col) cells of the grid that are not obstacles. An
    action moves the agent one cell in the chosen direction with probability
    correct_transition_probability and in each of the other three directions
    with probability noise_transition_probability. A move that would leave
    the grid or enter an obstacle leaves the agent where it is.
    """

    _goal_location = (4, 5)
    goal_reward = 100
    living_reward = 0
    hazard_cost = -100
    # NOTE: this set lives on the class and is therefore shared by every
    # instance that has not assigned its own; prefer `mdp.hazards = {...}`
    # over mutating it in place.
    hazards = set()
    # Plain attribute (not a property as in MDP) so it can be set per instance.
    temporal_discount_factor = .9
    goal_is_terminal = True

    # Probability that an action has its intended effect. It needs to be
    # <= 1. If it is less than 1, the rest of the probability mass is
    # distributed among the other 3 directions.
    _correct_transition_probability = .97
    _noise_transition_probability = .01

    @property
    def obstacles(self):
        """Grid layout; truthy cells are obstacles."""
        return np.zeros((2, 3))  # by default, 2x3 grid with no obstacles

    @property
    def goal_location(self):
        """(row, col) of the person to be rescued."""
        return self._goal_location

    @goal_location.setter
    def goal_location(self, location):
        # Locations are (row, col): the row is bounded by the height and the
        # column by the width, matching how `obstacles` is indexed.
        row, col = location[0], location[1]
        assert 0 <= row < self.height, 'goal row is outside the grid'
        assert 0 <= col < self.width, 'goal column is outside the grid'
        assert not self.obstacles[row][col], 'goal cannot be an obstacle'
        self._goal_location = location

    @property
    def height(self):
        """Number of grid rows."""
        return self.obstacles.shape[0]

    @property
    def width(self):
        """Number of grid columns."""
        return self.obstacles.shape[1]

    @property
    def state_space(self):
        """All (row, col) cells that are not obstacles."""
        return {(r, c) for r in range(self.height) for c in range(self.width)
                if not self.obstacles[r][c]}

    @property
    def action_space(self):
        return {'up', 'down', 'left', 'right'}

    def action_to_delta(self, action):
        """Map an action name to its (d_row, d_col) offset."""
        return {
            'up': (-1, 0),
            'down': (1, 0),
            'left': (0, -1),
            'right': (0, 1),
        }[action]

    def get_action(self, V, s):
        """Return the greedy action for state s under value function V.

        Each action is scored by its one-step lookahead value
        sum_{s'} P(s'|s,a) * (R(s,a,s') + gamma * V[s']).

        Args:
            V: A dict, V[state] -> value.
            s: A state.

        Returns:
            The best-scoring action; on ties the first one encountered while
            iterating action_space wins. False is returned if no action
            scores above -inf (for example when action_space is empty).
        """
        best_value = -float("inf")
        best_action = False

        for a in self.action_space:
            # Accumulate in transition order to keep float results stable.
            qsa = 0.
            for ns, p in self.get_transition_distribution(s, a).items():
                r = self.get_reward(s, a, ns)
                qsa += p * (r + self.temporal_discount_factor * V[ns])
            if qsa > best_value:
                best_value = qsa
                best_action = a

        return best_action

    @property
    def correct_transition_probability(self):
        """Probability that an action moves the agent as intended."""
        return self._correct_transition_probability

    @correct_transition_probability.setter
    def correct_transition_probability(self, prob):
        # This setter allows the user to specify the probability of an action
        # having the intended effect, e.g., the likelihood that the 'up'
        # action moves the agent 'up'. The probability has to be between 0
        # and 1. If it is less than 1, all the remaining probability mass is
        # equally distributed among the other three directions.
        assert 0 <= prob <= 1

        self._correct_transition_probability = prob
        self._noise_transition_probability = (1.0 - prob) / 3.0

    @property
    def noise_transition_probability(self):
        """Probability of each of the three unintended directions."""
        return self._noise_transition_probability

    def get_transition_distribution(self, state, action):
        """Return a dict mapping next states to probabilities.

        A terminal state transitions to itself with probability 1. Otherwise
        every direction contributes its probability to the cell it leads to;
        blocked directions contribute to the current cell.
        """
        if self.state_is_terminal(state):
            return {state: 1.0}

        intended = self.action_to_delta(action)
        next_state_dist = defaultdict(float)

        row, col = state
        for (dr, dc) in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
            r, c = row + dr, col + dc
            # Stay in place if out of bounds or obstacle
            if not (0 <= r < self.height and 0 <= c < self.width):
                r, c = row, col
            elif self.obstacles[r, c]:
                r, c = row, col
            if (dr, dc) == intended:
                next_state_dist[(r, c)] += self.correct_transition_probability
            else:
                next_state_dist[(r, c)] += self.noise_transition_probability

        return next_state_dist

    def get_reward(self, state, action, next_state):
        """Reward for entering next_state: goal_reward at the goal,
        hazard_cost on a hazard, living_reward otherwise. The goal check
        takes precedence over the hazard check."""
        if next_state == self.goal_location:
            return self.goal_reward
        if self.hazards and next_state in self.hazards:
            return self.hazard_cost
        return self.living_reward

    def state_is_terminal(self, state):
        """Only the goal is terminal, and only if goal_is_terminal is set."""
        if not self.goal_is_terminal:
            return False
        return state == self.goal_location

class SmallRescueMDP(RescueMDP):
  """A small (4x5) 2D grid rescue MDP with the goal in the top-left cell."""

  # Plain class attribute: it shadows the goal_location property inherited
  # from RescueMDP, so reads yield (0, 0) and assignments on an instance do
  # not go through the inherited validating setter.
  goal_location = (0, 0)

  @property
  def obstacles(self):
    layout = [
      [0, 0, 0, 0, 0],
      [0, 0, 0, 1, 0],
      [0, 0, 0, 1, 0],
      [0, 1, 0, 1, 1],
    ]
    return np.array(layout)

class LargeRescueMDP(RescueMDP):
  """A larger (9x10) 2D grid rescue MDP; 1 marks an obstacle cell."""

  @property
  def obstacles(self):
    layout = [
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 1, 0, 0, 0, 0, 1, 1],
      [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
      [0, 1, 0, 1, 0, 0, 1, 0, 0, 0],
      [0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
      [0, 1, 1, 0, 0, 0, 0, 1, 0, 0],
      [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ]
    return np.array(layout)




## Wait, Bellman, Backup!


### Question
Complete the implementation of the bellman backup for an infinite or indefinite horizon MDP.

For reference, our solution is **13** line(s) of code.

In [None]:
def bellman_backup(s, V, mdp):
    """Look ahead one step and propose an update for the value of s.

    The backup is the best one-step lookahead value over all actions:

        max_a  sum_{s'} P(s' | s, a) * (R(s, a, s') + gamma * V[s'])

    where P comes from mdp.get_transition_distribution, R from
    mdp.get_reward and gamma is mdp.temporal_discount_factor.

    You can assume that the mdp is either infinite or indefinite
    horizon (that is, mdp.horizon is inf).

    It is possible to handle terminal states either here or in
    value iteration. For consistency with our solution, please
    handle terminal states in value iteration, not here.

    V must not be modified; only the new estimate for s is returned.

    Args:
        s: A state.
        V: A dict, V[state] -> value.
        mdp: An MDP.

    Returns:
        vs: new value estimate for s.
    """
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def test1_bellman_backup():
    """Backups on SingleRowMDP starting from an all-zero value function."""
    mdp = SingleRowMDP()
    s = 3
    V = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}
    new_V_s = bellman_backup(s, V, mdp)
    # Bellman backup should not change V
    assert V == {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}
    # Compare with a tolerance: the exact float depends on the order in which
    # an implementation sums the terms.
    assert abs(new_V_s - (0.9 * 10 + 0.1 * -1)) < 1e-5
    s = 2
    new_V_s = bellman_backup(s, V, mdp)
    assert abs(new_V_s - -1.) < 1e-5

test1_bellman_backup()


def test2_bellman_backup():
    """Backups on ZitsMDP starting from an all-zero value function."""
    mdp = ZitsMDP()
    V = {s : 0 for s in mdp.state_space}
    expected = {0: -0.4, 1: -0.8, 2: -1.8, 3: -1.8, 4: -1.8}
    for s, want in expected.items():
        # Tolerance instead of ==: the exact float depends on the order in
        # which an implementation sums the terms.
        assert abs(bellman_backup(s, V, mdp) - want) < 1e-5

test2_bellman_backup()


def test3_bellman_backup():
    """Backups on ZitsMDP starting from a non-trivial value function."""
    mdp = ZitsMDP()
    V = {0 : -0.1, 1: 0.1, 2: 5, 3: -4, 4: -2.2}
    expected = {0: -0.418, 1: 0.946, 2: -2.268, 3: -0.892, 4: -2.268}
    for state, want in expected.items():
        error = abs(bellman_backup(state, V, mdp) - want)
        assert error < 1e-5

test3_bellman_backup()

print('Tests passed.')

## There's Value in that Iteration


### Question
Complete the implementation of value iteration for an infinite or indefinite horizon MDP.

For reference, our solution is **20** line(s) of code.

In addition to all the utilities defined at the top of the Colab notebook, the following functions are available in this question environment: `bellman_backup`. You may not need to use all of them.

In [None]:
def value_iteration(mdp, max_num_iters=1000, change_threshold=1e-4):
    """Run value iteration for a certain number of iterations or until
    the max change between iterations is below a threshold.

    Specifically, you should terminate when:
        (max_{s} |V(s) - V'(s)|) < change_threshold
    where V is the old value function estimate, V' is the new one,
    and |*| denotes absolute value.

    You can assume that the mdp is either infinite or indefinite
    horizon (that is, mdp.horizon is inf).

    Make sure to handle terminal states! You will need to think about
    what behavior we should expect from value iteration exactly to
    deal with terminal states, and then implement that behavior.

    Note: the tests in this notebook expect that a single iteration
    (max_num_iters=1, or change_threshold=inf) yields the result of one
    sweep of backups, and that terminal states end up with value 0.

    Args:
        mdp: An MDP.
        max_num_iters: An int representing the maximum number of
            iterations to run value iteration before giving up.
        change_threshold: A float used to determine when value iteration
            has converged and it is safe to terminate.

    Returns:
        V:  A dict, V[state] -> value, with an entry for every state
            in mdp.state_space.
    """
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def test1_value_iteration():
    """Converged values on SingleRowMDP."""
    mdp = SingleRowMDP()
    expected_V = {0: 0.0, 1: 5.58531, 2: 8.31706, 3: 9.73170, 4: 0.0}
    V = value_iteration(mdp)
    for state in mdp.state_space:
        error = abs(V[state] - expected_V[state])
        assert error < 1e-4

test1_value_iteration()


def test2_value_iteration():
    """Converged values on ZitsMDP."""
    mdp = ZitsMDP()
    expected_V = {0: -6.40530, 1: -7.07368, 2: -7.81918, 3: -7.81918, 4: -7.81918}
    V = value_iteration(mdp)
    for state in mdp.state_space:
        error = abs(V[state] - expected_V[state])
        assert error < 1e-4

test2_value_iteration()


def test3_value_iteration():
    """Both stopping criteria must end the run after a single iteration."""
    mdp = SingleRowMDP()
    expected_V = {0: 0.0, 1: -1.9, 2: -1.0, 3: 8.9, 4: 0.0}
    # First stop via the iteration cap, then via an always-satisfied
    # change threshold.
    early_stops = ({"max_num_iters": 1}, {"change_threshold": float("inf")})
    for kwargs in early_stops:
        V = value_iteration(mdp, **kwargs)
        for state in mdp.state_space:
            assert abs(V[state] - expected_V[state]) < 1e-4

test3_value_iteration()


def test4_value_iteration():
    """Spot-check a few converged values on ChaseMDP."""
    V = value_iteration(ChaseMDP())
    partial_expected_V = {
        ((0, 1), (0, 1)): 0.0,
        ((0, 1), (1, 0)): 0.87506,
        ((1, 0), (0, 2)): 0.80601,
        ((0, 2), (1, 2)): 0.96536,
        ((1, 1), (0, 1)): 0.94896,
    }
    for state, want in partial_expected_V.items():
        assert abs(V[state] - want) < 1e-4

test4_value_iteration()

print('Tests passed.')

## MDP Question 1


### Question

We have provided a RescueMDP class for you, that models the robot's motion
as noisy. This class is in many ways similar to the ChaseMDP class
in that it is a maze MDP with obstacles. However, there's just the one
agent (the robot, no bunny).

Recall that an MDP is defined by the following tuple:
* States: The state space is a set of coordinates. The set is defined over a
grid, but cells that are obstacles aren't in the state space.
* Actions: The robot can take four actions, up, down, left, right.
* A transition function: The probability the action succeeds at moving the
robot in the given direction is given by the class field
`correct_transition_probability`. If this probability is less than 1, then the
rest of the probability mass is uniformly distributed among the other three
directions. Any probability mass for a motion that would move the robot into
an obstacle or out of the map is mapped to the robot staying in the same
place. If the robot is in the goal state and the MDP has the flag
`goal_is_terminal = True`, it cannot transition out of the goal state. If the flag
`goal_is_terminal = False`, the robot is free to leave the goal state and
re-enter it.
* Reward function: The robot gets a reward of `living_reward` for each action
it takes. It gets reward of `goal_reward` every time it enters the goal
state.

Remember that arrays are in row-major order, that is, the arrays are indexed by
(row, column). The person to be rescued (goal_location) is at (4, 5) by default;
SmallRescueMDP overrides this to (0, 0).

We also want you to be able to examine the policy that results from solving
for the optimal value function. Please use the helper function
`plot_value_function` to generate a plot of both the value function and the
corresponding policy.

Please create a LargeRescueMDP and compute the optimal value function.


For reference, our solution is **4** line(s) of code.

In addition to all the utilities defined at the top of the Colab notebook, the following functions are available in this question environment: `bellman_backup`, `value_iteration`. You may not need to use all of them.

In [None]:
def MDP_1():
    """Creates a LargeRescueMDP() with its default settings, and returns the
    optimal value function.

    Args:
      None

    Returns:
      value function: a dict mapping each (row, col) state to its value
    """
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def vi_test(function, V):
  """Seed the random number generators, call function() and check the value
  function it returns against the reference V."""
  import random
  import numpy.random as npr
  random.seed(0)
  npr.seed(0)
  submitted = function()
  assert check_value_function(submitted, V)

vi_test(MDP_1, {(4, 0): 26.97240030571446, (3, 4): 89.28524333009564, (4, 9): 46.28196469917531, (3, 7): 63.866705918797976, (5, 4): 89.19904388402188, (8, 0): 41.56749854170138, (0, 2): 51.77167752199026, (8, 3): 57.77191141593772, (8, 9): 46.46228722379235, (0, 5): 71.33026555348314, (2, 2): 41.5179757146076, (1, 0): 37.29501402141274, (1, 6): 71.46317864388475, (0, 8): 51.577016904150774, (2, 5): 89.01900243377555, (8, 6): 64.24856551191186, (2, 8): 63.866697606203566, (7, 4): 71.6813569329961, (7, 1): 41.52806356045833, (7, 7): 64.1906123496321, (6, 5): 88.9353734615529, (6, 8): 63.99437878087923, (4, 2): 33.35853220876574, (3, 0): 29.997978602266883, (4, 5): 0.0, (3, 9): 51.378797571401286, (5, 0): 30.00062894204498, (4, 8): 51.477861619010575, (5, 6): 89.10343277693175, (5, 3): 79.95573398571852, (5, 9): 51.427184372629654, (8, 2): 51.784353648073704, (8, 5): 71.26442203181726, (0, 1): 46.36201563185604, (0, 7): 57.53968483878395, (2, 4): 80.11518850941486, (1, 2): 46.361654557652294, (0, 4): 64.49822454430208, (2, 1): 37.294652947208995, (2, 7): 71.31884930507499, (1, 5): 79.64569348621423, (8, 8): 51.73109931662876, (6, 1): 37.22526933032567, (7, 0): 37.29908558822458, (6, 4): 79.96208134752649, (7, 3): 64.38378339850985, (7, 9): 51.63515062834477, (6, 7): 71.52909705147924, (7, 6): 71.6037666010765, (3, 2): 37.21529472183023, (4, 1): 29.90300033148703, (4, 7): 57.24942466484467, (3, 5): 99.50024037525212, (4, 4): 99.50187546410399, (3, 8): 57.25614447733788, (5, 5): 99.40514065110256, (8, 4): 64.31801372707054, (0, 0): 41.55773396442767, (5, 8): 57.30901926222007, (8, 1): 46.37295740578368, (1, 1): 41.5626015183776, (0, 3): 57.81353882364696, (0, 9): 46.27619295995178, (2, 0): 33.465368707503046, (1, 4): 71.88481819474575, (0, 6): 64.12430323875384, (8, 7): 57.651153477926165, (2, 9): 57.24850935593198, (1, 7): 64.12419955940373, (2, 6): 79.71607202444552, (6, 0): 33.4683494865608, (6, 6): 79.87591094828778, (7, 5): 79.57152694202794, (6, 3): 
71.75171813607103, (6, 9): 57.31044771948743, (7, 8): 57.5446502393624})
print('Tests passed.')

## MDP Question 2


### Question

The default transition model for the LargeRescueMDP is pretty close
to deterministic. There is a .97 chance that each action succeeds in the intended
direction (unless there's an obstacle in the way or it's the edge of the map)
and only a .01 chance of ending up in one of the other 3 neighbouring grid
cells.

What happens if we make the transition model noisier? Setting the
correct_transition_probability to .76 is not too noisy, but it has
implications for our ability to solve for the optimal policy.

Please create a LargeRescueMDP and set the correct_transition_probability to
be .76. You can set the correct_transition_probability field of the
LargeRescueMDP instance directly, and the transition function will be adjusted
accordingly. (You can read the comment in the RescueMDP setter function, which
LargeRescueMDP inherits, for more detail on how setting the
correct_transition_probability works.)


For reference, our solution is **5** line(s) of code.

In addition to all the utilities defined at the top of the Colab notebook, the following functions are available in this question environment: `bellman_backup`, `value_iteration`. You may not need to use all of them.

In [None]:
def MDP_2():
    """Creates a LargeRescueMDP(), sets the correct_transition_probability to
    .76 and returns the optimal value function.

    Args:
      None

    Returns:
      value function: a dict mapping each (row, col) state to its value
    """
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def vi_test(function, V):
  """Seed the random number generators, call function() and check the value
  function it returns against the reference V."""
  import random
  import numpy.random as npr
  random.seed(0)
  npr.seed(0)
  submitted = function()
  assert check_value_function(submitted, V)

vi_test(MDP_2, {(4, 0): 17.193656448413645, (3, 4): 82.66821248865962, (4, 9): 33.24527236077266, (3, 7): 48.38047730522361, (5, 4): 81.57596106315178, (8, 0): 29.485520779767217, (0, 2): 39.54500010633741, (8, 3): 46.729294692291475, (8, 9): 34.81853504895561, (0, 5): 57.71031630612815, (2, 2): 28.67493693106001, (1, 0): 25.425642667196094, (1, 6): 58.371006293552576, (0, 8): 38.3562257828573, (2, 5): 79.50785163099987, (8, 6): 51.83291668257821, (2, 8): 48.37609286765003, (7, 4): 60.490236730134086, (7, 1): 29.15147584428534, (7, 7): 51.413594478870216, (6, 5): 78.66538264988357, (6, 8): 49.519712872964426, (4, 2): 21.29292539099053, (3, 0): 19.184941573721872, (4, 5): 0.0, (3, 9): 36.70592766228915, (5, 0): 19.36445038655641, (4, 8): 37.33611818768401, (5, 6): 80.39156750674351, (5, 3): 70.35461243079723, (5, 9): 37.09563198565326, (8, 2): 40.21565223450838, (8, 5): 57.16960149114739, (0, 1): 33.657954212109814, (0, 7): 44.47885137481426, (2, 4): 71.94570365595851, (1, 2): 33.63013421547188, (0, 4): 54.116379728735176, (2, 1): 25.397822670558156, (2, 7): 56.82786088085701, (1, 5): 67.23721597706378, (8, 8): 39.50173149602546, (6, 1): 25.180061683772845, (7, 0): 25.734404278265206, (6, 4): 70.24275282935724, (7, 3): 53.62383581844224, (7, 9): 38.68727231509817, (6, 7): 58.8166743393827, (7, 6): 59.54114719666043, (3, 2): 24.704121310787116, (4, 1): 18.46056909239355, (4, 7): 41.79958505037646, (3, 5): 94.47917738184584, (4, 4): 94.63963415294967, (3, 8): 41.906350224012456, (5, 5): 93.32556960074426, (8, 4): 52.64659733120905, (0, 0): 29.033502757353006, (5, 8): 42.2743189133091, (8, 1): 34.191129369373066, (1, 1): 29.10269089376744, (0, 3): 46.568742068171915, (0, 9): 33.46384586176194, (2, 0): 22.19943115828667, (1, 4): 62.444310480187774, (0, 6): 50.95192651060614, (8, 7): 45.25820887982636, (2, 9): 41.7430747582296, (1, 7): 50.88346010072386, (2, 6): 67.54061291729131, (6, 0): 22.424055347153732, (6, 6): 69.17350279458515, (7, 5): 66.56559260006958, (6, 3): 
61.46654014072949, (6, 9): 42.379150021338276, (7, 8): 44.3619253645486})
print('Tests passed.')

## MDP Question 3


### Question

The default temporal_discount_factor is .9. This is in general quite a low
discount factor: a reward received 20 steps in the future is weighted by only
$.9^{20} \approx .12$ relative to an immediate reward.

What happens if we increase the discount factor? Please create a
LargeRescueMDP, set the temporal_discount_factor to be .99, and also set
the correct_transition_probability to be .76. Then please solve for the
optimal value function.


For reference, our solution is **6** line(s) of code.

In addition to all the utilities defined at the top of the Colab notebook, the following functions are available in this question environment: `bellman_backup`, `value_iteration`. You may not need to use all of them.

In [None]:
def MDP_3():
    """Creates a LargeRescueMDP(), sets the correct_transition_probability to
    .76 and the temporal_discount_factor to .99 and returns the optimal value
    function.

    Args:
      None

    Returns:
      value function: a dict mapping each (row, col) state to its value
    """
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def vi_test(function, V):
  """Seed the random number generators, call function() and check the value
  function it returns against the reference V."""
  import random
  import numpy.random as npr
  random.seed(0)
  npr.seed(0)
  submitted = function()
  assert check_value_function(submitted, V)

vi_test(MDP_3, {(4, 0): 84.0088571825647, (3, 4): 98.08785113514071, (4, 9): 89.54219638658633, (3, 7): 92.85684581202082, (5, 4): 97.92940667071218, (8, 0): 88.55907148092525, (0, 2): 91.15195158169959, (8, 3): 92.71010661836499, (8, 9): 90.02820552034618, (0, 5): 94.57557687282643, (2, 2): 88.2596929505786, (1, 0): 87.24220975351437, (1, 6): 94.66146525243018, (0, 8): 90.84234985893639, (2, 5): 97.64083406000782, (8, 6): 93.58873399735727, (2, 8): 92.85540170289251, (7, 4): 95.05380837452687, (7, 1): 88.4404389970416, (7, 7): 93.49470937074302, (6, 5): 97.51833426545349, (6, 8): 93.09749758158448, (4, 2): 85.70758382182571, (3, 0): 84.8709760689106, (4, 5): 0.0, (3, 9): 90.3793968171442, (5, 0): 84.979923830497, (4, 8): 90.54408402590663, (5, 6): 97.7688940677295, (5, 3): 96.51303468508517, (5, 9): 90.49294151104122, (8, 2): 91.34026747888913, (8, 5): 94.47412332570079, (0, 1): 89.68812885428089, (0, 7): 92.17514466317554, (2, 4): 96.75361408485887, (1, 2): 89.67672656160515, (0, 4): 94.06459700794483, (2, 1): 87.23080746083865, (2, 7): 94.36773185735558, (1, 5): 96.0088117943946, (8, 8): 91.131398346243, (6, 1): 87.17982482641227, (7, 0): 87.385190483955, (6, 4): 96.47708368340508, (7, 3): 93.97491750094179, (7, 9): 90.91508992377925, (6, 7): 94.7388560346764, (7, 6): 94.86949260973816, (3, 2): 86.97078685302058, (4, 1): 84.52926790610293, (4, 7): 91.53586068943369, (3, 5): 99.37197204832822, (4, 4): 99.39679280725164, (3, 8): 91.54877634429582, (5, 5): 99.22275748726061, (8, 4): 93.77006553865067, (0, 0): 88.39225620136497, (5, 8): 91.64279788012988, (8, 1): 89.85966251042251, (1, 1): 88.40199915122935, (0, 3): 92.67280996413947, (0, 9): 89.65081472852465, (2, 0): 86.08978494897735, (1, 4): 95.40750777342734, (0, 6): 93.41218842117453, (8, 7): 92.35290042089164, (2, 9): 91.51907133407133, (1, 7): 93.39431118581003, (2, 6): 96.04265401195448, (6, 0): 86.21161978985722, (6, 6): 96.30174912325705, (7, 5): 95.897070647761, (6, 3): 95.24349019068275, (6, 9): 
91.67470906173104, (7, 8): 92.13682235192404})
print('Tests passed.')

## MDP Question 4


### Question

Now let's make the dynamics really noisy. Once again, please create a
LargeRescueMDP with a hazard at `{(1, 4)}`, but with
correct_transition_probability set to 0.5. You should see a very substantial
change in both the policy and value function.


For reference, our solution is **6** line(s) of code.

In addition to all the utilities defined at the top of the Colab notebook, the following functions are available in this question environment: `bellman_backup`, `value_iteration`. You may not need to use all of them.

In [None]:
def MDP_4():
    """Creates a LargeRescueMDP(), sets a hazard at `{(1, 4)}` and
    correct_transition_probability to 0.5, and returns
    the optimal value function.

    Args:
      None

    Returns:
      value function: a dict mapping each (row, col) state to its value
    """
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def vi_test(function, V):
  """Check a value-iteration solution against a reference value function.

  Both the stdlib and the legacy NumPy global random generators are seeded
  with 0 before `function` is called, so a solution that uses either of
  them is reproducible.

  Args:
    function: zero-argument callable returning a dict of state -> value
    V: the expected dict of state -> value
  """
  import random
  import numpy.random as npr

  random.seed(0)
  npr.seed(0)
  submitted = function()
  assert check_value_function(submitted, V)

vi_test(MDP_4, {(4, 0): 3.9085329993652533, (3, 4): 58.805202984216365, (4, 9): 11.753646222356124, (3, 7): 14.723408455358632, (5, 4): 61.773838936434885, (8, 0): 10.878573027487132, (0, 2): -0.20781732009289544, (8, 3): 25.36339749449463, (8, 9): 15.238821727444563, (0, 5): 3.0643671550462237, (2, 2): 1.3841713034879826, (1, 0): 1.5389420673207934, (1, 6): 17.433410439493713, (0, 8): 9.345469276586272, (2, 5): 41.64410517348415, (8, 6): 27.34061864611354, (2, 8): 14.623495028538457, (7, 4): 36.3058589431468, (7, 1): 10.407323848359, (7, 7): 26.517051201155294, (6, 5): 53.88967949297427, (6, 8): 22.684327610380464, (4, 2): 2.317116216835473, (3, 0): 2.9670510404427017, (4, 5): 0.0, (3, 9): 10.690151502522395, (5, 0): 5.390778213003905, (4, 8): 13.032557691482545, (5, 6): 58.417081668751784, (5, 3): 48.1633084382923, (5, 9): 14.293773308570756, (8, 2): 19.299393310263866, (8, 5): 30.829638790982713, (0, 1): 0.8020744337498172, (0, 7): 11.988658922265932, (2, 4): 16.3730627200805, (1, 2): 0.9073556276657573, (0, 4): -21.315413465306385, (2, 1): 1.5783110450404056, (2, 7): 19.59916846774346, (1, 5): 3.022657227097599, (8, 8): 18.15310792716475, (6, 1): 8.20816411897359, (7, 0): 8.845736779684062, (6, 4): 47.243646625788344, (7, 3): 31.763122639369513, (7, 9): 16.655193698431397, (6, 7): 32.12324354829552, (7, 6): 33.91437634728944, (3, 2): 1.7861164466125632, (4, 1): 3.0091125874058258, (4, 7): 12.257729404123905, (3, 5): 76.54987697070835, (4, 4): 80.10218352076012, (3, 8): 12.377457524096181, (5, 5): 76.11208979327662, (8, 4): 29.13716588731284, (0, 0): 1.1610960979976075, (5, 8): 16.831639699010523, (8, 1): 13.973656429560908, (1, 1): 1.1974311585861637, (0, 3): -4.701281693660509, (0, 9): 7.646274417336397, (2, 0): 2.1208429059739875, (1, 4): -14.16789428451299, (0, 6): 11.885859730077536, (8, 7): 22.357412587191533, (2, 9): 11.691554230957815, (1, 7): 15.568152460906626, (2, 6): 28.582035217246688, (6, 0): 7.082840364895587, (6, 6): 44.27678096369422, (7, 5): 
39.40783614141224, (6, 3): 39.44059244534516, (6, 9): 17.470929083719025, (7, 8): 20.556565474493652})
print('Tests passed.')

## Value from Q


### Utilities

### Mountain Car MDP and Regressors
**Note**: these imports and functions are available in catsoop. You do not need to copy them in.

In [None]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from functools import partial


class MountainCar:
    '''
    Mountain Car MDP.

    The state is a tuple of two floats (x, v), denoting the position and
    velocity of the car. Every simulated step has reward -1.0; an episode
    stops once the car is at or beyond `goal_position` with at least
    `goal_velocity`.
    '''
    def __init__(self, goal_velocity=0, difficulty='hard', rng=None):
        '''
        Args:
        - goal_velocity : float, minimum velocity required at the goal
        - difficulty : 'hard' starts the car in [-0.6, -0.4]; any other value
          starts it in [0.0, 0.5]
        - rng : np.random.Generator or None. When None, a fresh generator
          seeded with 42 is created for this instance. (A Generator used
          directly as the default argument would be built once and silently
          shared by every instance, making results depend on how many
          instances were created before.)
        '''
        self.difficulty = difficulty
        self.min_position = -1.2
        self.max_position = 0.6
        self.max_speed = 0.07
        self.goal_position = 0.5
        self.goal_velocity = goal_velocity

        self.force = 0.001
        self.gravity = 0.0025
        self.force_noise = 0.0002

        self.low = np.array([self.min_position, -self.max_speed], dtype=np.float32)
        self.high = np.array([self.max_position, self.max_speed], dtype=np.float32)
        self.actions = (0, 1, 2)          # (left_acc, none, right_acc)
        self.discount_factor = 1.0
        self.rng = np.random.default_rng(42) if rng is None else rng

        # Each simulated step integrates the dynamics over this many time units.
        self.time_factor = 10.

        self.init_state()

    def init_state(self):
        '''Reset `self.state` to a random start position at rest and return it.'''
        if self.difficulty == 'hard':
            self.state = (self.rng.uniform(-0.6, -0.4), 0.0)
        else:
            self.state = (self.rng.uniform(0.0, 0.5), 0.0)
        return self.state

    def terminal(self, s):
        '''Return True when state `s` has reached the goal position and velocity.'''
        position, velocity = s
        return bool(position >= self.goal_position and velocity >= self.goal_velocity)

    def sim_transition(self, action: int):
        '''
        Advance `self.state` by one step under `action`.

        Args:
        - action : {0, 1, 2}, indicating left, none, right
        Returns:
        - reward : float
        - state : (x_position : float, velocity : float)
        '''
        position, velocity = self.state
        # Engine force + gravity along the slope + Gaussian force noise.
        acceleration = (
            (action - 1) * self.force
            + np.cos(3 * position) * (-self.gravity)
            + self.rng.normal(scale=self.force_noise)
        )
        velocity += acceleration * self.time_factor
        velocity = np.clip(velocity, -self.max_speed, self.max_speed)
        position += velocity * self.time_factor
        position = np.clip(position, self.min_position, self.max_position)
        # The car stops dead when it runs into the left wall.
        if position == self.min_position and velocity < 0:
            velocity = 0
        reward = -1.0

        self.state = (position, velocity)

        return reward, self.state

    def sim_episode(self, policy, max_iters = 20):
        '''
        Roll out `policy` from a fresh start state for at most `max_iters` steps.

        Args:
        - policy : state -> action
        - max_iters : int, maximum number of simulated steps
        Returns:
        - traj : [tuple (state, action, reward, next_state)]. If a terminal
          state is reached, one zero-reward self-transition per action is
          appended for it and the rollout ends.
        '''
        traj = []
        s = self.init_state()
        for _ in range(max_iters):
            if self.terminal(s):
                for a in self.actions:
                    traj.append((s, a, 0, s))
                return traj
            a = policy(s)
            (r, s_prime) = self.sim_transition(a)
            traj.append((s, a, r, s_prime))
            s = s_prime
        return traj

    def evaluate(self, n_play, traj_length, policy):
        '''
        Return the average total reward of `policy` over `n_play` episodes of
        at most `traj_length` steps each.
        '''
        score = 0
        for _ in range(n_play):
            episode = self.sim_episode(policy=policy, max_iters=traj_length)
            score += sum(x[2] for x in episode) # reward
        return score/n_play


class QFRegressor:
    """Base class for fitted-Q function approximators over an MDP.

    Subclasses provide `fq_q_value`; the policy helpers here delegate to the
    module-level `compute_value_from_q`, `greedy_policy_from_q` and
    `epsilon_greedy_policy_from_q` functions. Until `fitted` is set to True
    the policies pick uniformly random actions.
    """

    def __init__(self, mdp, rng=None):
        """
        Args:
        - mdp : object exposing an `actions` sequence
        - rng : np.random.Generator or None. When None, a fresh generator
          seeded with 42 is created for this instance, so regressors do not
          silently share one generator through a default argument.
        """
        self.mdp = mdp
        self.fitted = False
        self.rng = np.random.default_rng(42) if rng is None else rng

    def fq_q_value(self, s, a):
        """Return the estimated Q value of taking action `a` in state `s`."""
        raise NotImplementedError('Override me')

    def fq_value(self, s):
        """Return the state value of `s` derived from the Q function."""
        return compute_value_from_q(self.mdp.actions, self.fq_q_value, s)

    def fq_greedy(self, s):
        """Return the greedy action at `s`, or a random action before fitting."""
        if not self.fitted:
            return self.rng.choice(self.mdp.actions)
        return greedy_policy_from_q(self.mdp.actions, self.fq_q_value, s)

    def fq_epsilon_greedy(self, s, eps):
        """Return an epsilon-greedy action at `s`, or a random action before fitting."""
        if not self.fitted:
            return self.rng.choice(self.mdp.actions)
        return epsilon_greedy_policy_from_q(self.mdp.actions, self.fq_q_value, s, eps, self.rng)

class NeuralRegressor(QFRegressor):
    """Fitted-Q regressor that keeps one MLPRegressor per action."""

    def initialize(self, max_iter=1000, hidden_layer_sizes=(40,40)):
        """Create a fresh, unfitted network for every action of the MDP."""
        models = {}
        for action in self.mdp.actions:
            models[action] = MLPRegressor(
                hidden_layer_sizes=hidden_layer_sizes,
                max_iter=max_iter,
                learning_rate_init=0.03,
            )
        self.fq_models = models
        self.fitted = False

    def fq_q_value(self, s, a):
        """Return the prediction of action `a`'s model for the single state `s`."""
        model = self.fq_models[a]
        # The state is reshaped into one row before being passed to predict.
        features = np.array(s).reshape(1, -1)
        return model.predict(features)

    def fit(self, a, X, Y):
        """Fit action `a`'s model on inputs X and targets Y."""
        model = self.fq_models[a]
        model.fit(X, Y)

class KNNRegressor(QFRegressor):
    """Fitted-Q regressor that keeps one KNeighborsRegressor per action."""

    def initialize(self, n_neighbors=3):
        """Create a fresh, unfitted k-NN model for every action of the MDP."""
        models = {}
        for action in self.mdp.actions:
            models[action] = KNeighborsRegressor(n_neighbors=n_neighbors)
        self.fq_models = models
        self.fitted = False

    def fq_q_value(self, s, a):
        """Return the prediction of action `a`'s model for the single state `s`."""
        model = self.fq_models[a]
        # The state is reshaped into one row before being passed to predict.
        features = np.array(s).reshape(1, -1)
        return model.predict(features)

    def fit(self, a, X, Y):
        """Fit action `a`'s model on inputs X and targets Y."""
        model = self.fq_models[a]
        model.fit(X, Y)

### Question
Compute the value function from the Q function.

For reference, our solution is **3** line(s) of code.

In [None]:
def compute_value_from_q(action_space, q_function, state):
    """Given the action space, a q_function and a state, compute the value
    for this state using the q_function.

    Args:
    - action_space : tuple of actions
    - q_function : (state, action) -> q_value : float
    - state : state
    Returns:
    - value : float - the value of this state
    """
    # Homework stub: deliberately unimplemented until the student fills it in.
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def value_from_q_problem_test():
    """Check that compute_value_from_q returns the best score (5.) no matter
    which action currently holds it."""
    actions = (0, 1, 2)
    action_scores = [-3, 1, 5.]
    test_state = 3

    def q(state, action):
        assert state == test_state
        assert action in actions
        return action_scores[action]

    # Evaluate once per rotation of the scores; the first element is moved to
    # the back before the second and third checks.
    for rotation in range(len(action_scores)):
        if rotation > 0:
            action_scores.append(action_scores.pop(0))
        assert compute_value_from_q(actions, q, test_state) == 5.

value_from_q_problem_test()

print('Tests passed.')

## Greedy Policy from Q


### Question
Write the greedy policy given a Q function.

For reference, our solution is **6** line(s) of code.

In [None]:
def greedy_policy_from_q(action_space, q_function, state):
    """Given the action space, a q_function and a state, compute the action
    taken by the greedy policy at this state, under the q_function.

    Args:
    - action_space : tuple of actions
    - q_function : (state, action) -> q_value : float
    - state : state

    Return:
    - action
    """
    # Homework stub: deliberately unimplemented until the student fills it in.
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def greedy_policy_from_q_test():
    """Check that greedy_policy_from_q follows the best score as it moves
    from action 2 to action 1 to action 0."""
    actions = (0, 1, 2)
    action_scores = [-3, 1, 5.]
    test_state = 3

    def q(state, action):
        assert state == test_state
        assert action in actions
        return action_scores[action]

    # Before each check after the first, the leading score is moved to the
    # back, which shifts the best score one action to the left.
    for step, expected_action in enumerate((2, 1, 0)):
        if step > 0:
            action_scores.append(action_scores.pop(0))
        assert greedy_policy_from_q(actions, q, test_state) == expected_action

greedy_policy_from_q_test()

print('Tests passed.')

## Epsilon Greedy Policy from Q


### Question
Write the epsilon greedy policy given a Q function. With probability epsilon, the policy should pick a random action, and with probability 1-epsilon, pick the greedy action. You may use the `greedy_policy_from_q` function. You may find `rng.choice` and `rng.random` useful.

For reference, our solution is **6** line(s) of code.

In addition to all the utilities defined at the top of the Colab notebook, the following functions are available in this question environment: `greedy_policy_from_q`. You may not need to use all of them.

In [None]:
def epsilon_greedy_policy_from_q(action_space, q_function, state, eps, rng):
    """Given the action space, a q_function and a state, compute the action
    taken by an epsilon-greedy policy at this state, under the q_function.

    Args:
    - action_space : tuple of actions
    - q_function : (state, action) -> q_value : float
    - state : state
    - eps : float in [0, 1], the probability of picking a random action
    - rng : np.random.Generator

    Return:
    - action
    """
    # Homework stub: deliberately unimplemented until the student fills it in.
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def epsilon_greedy_policy_from_q_test():
    """Statistical check of epsilon_greedy_policy_from_q.

    Draws N actions and checks that each action's count lies within three
    binomial standard deviations of its expected count: every action is
    picked with probability eps/3 by exploration, and the greedy action
    (action 2) additionally with probability 1-eps.
    """
    actions = (0, 1, 2)
    action_scores = [-3, 1, 5.]
    test_state = 3
    rng = np.random.default_rng(42)
    eps = 0.5
    N = 1000

    def q(state, action):
        assert(state == test_state)
        assert action in actions
        return action_scores[action]

    def binary_std(p):
        # Standard deviation of a Binomial(N, p) count.
        return (p * (1-p) * N)**.5

    runs = [epsilon_greedy_policy_from_q(actions, q, test_state, eps, rng) for _ in range(N)]
    counts = [sum(1 for j in runs if j == i) for i in actions]

    p_explore = eps / len(actions)
    p_greedy = p_explore + (1 - eps)
    assert(np.abs(counts[0] - N * p_explore) < binary_std(p_explore) * 3)
    assert(np.abs(counts[1] - N * p_explore) < binary_std(p_explore) * 3)
    # The greedy action's count has its own success probability, so its
    # tolerance must use the matching standard deviation.
    assert(np.abs(counts[2] - N * p_greedy) < binary_std(p_greedy) * 3)
    assert(sum(counts) == N)

epsilon_greedy_policy_from_q_test()

print('Tests passed.')

## Sampling points in grid


### Question
Next, we want to sample points in the state space on which to perform Bellman backups. Write a function that samples the state space in an evenly spaced grid and returns the sampled points and their corresponding transitions.

For reference, our solution is **18** line(s) of code.

In [None]:
def sample_grid_points(x_divisions, v_divisions, mdp):
    """Samples x_divisions times v_divisions points in a grid across
    the state space, defined by [mdp.min_position, mdp.max_position] x
    [-mdp.max_speed, mdp.max_speed]. Then, sample their next transitions
    using the mdp.sim_transition method. Returns a list of tuples, each
    tuple describing the sampled state and its transition.

    Args:
    - x_divisions : int - number of grid points along the position axis
    - v_divisions : int - number of grid points along the velocity axis
    - mdp : MountainCar
    Return:
    - memory : [tuple (state, action, reward, next_state)]
    """
    # NOTE(review): the accompanying test data contains one entry per action
    # for every grid state, and lists goal states as zero-reward
    # self-transitions (s, a, 0.0, s) - confirm against the reference solution.
    # Homework stub: deliberately unimplemented until the student fills it in.
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def sample_grid_points_test():
    """Check sample_grid_points on a noise-free 3 x 4 grid.

    Every returned transition must match one of the reference transitions
    (to within 1e-6), and the number of returned transitions must equal the
    number of reference transitions.
    """
    mc = MountainCar()
    mc.force_noise = 0.
    results = sample_grid_points(3, 4, mc)
    my_results = [((-1.2, -0.07), 0, -1.0, (-1.2, 0)), ((-1.2, -0.07), 1, -1.0, (-1.2, 0)), ((-1.2, -0.07), 2, -1.0, (-1.2, 0)), ((-1.2, -0.023333333333333338), 0, -1.0, (-1.2, 0)), ((-1.2, -0.023333333333333338), 1, -1.0, (-1.2, 0)), ((-1.2, -0.023333333333333338), 2, -1.0, (-1.1091437292497965, 0.009085627075020343)), ((-1.2, 0.02333333333333333), 0, -1.0, (-0.8424770625831298, 0.035752293741687015)), ((-1.2, 0.02333333333333333), 1, -1.0, (-0.7424770625831298, 0.04575229374168702)), ((-1.2, 0.02333333333333333), 2, -1.0, (-0.6424770625831299, 0.05575229374168701)), ((-1.2, 0.07), 0, -1.0, (-0.4999999999999999, 0.07)), ((-1.2, 0.07), 1, -1.0, (-0.4999999999999999, 0.07)), ((-1.2, 0.07), 2, -1.0, (-0.4999999999999999, 0.07)), ((-0.30000000000000004, -0.07), 0, -1.0, (-1.0, -0.07)), ((-0.30000000000000004, -0.07), 1, -1.0, (-1.0, -0.07)), ((-0.30000000000000004, -0.07), 2, -1.0, (-1.0, -0.07)), ((-0.30000000000000004, -0.023333333333333338), 0, -1.0, (-0.7887358254009995, -0.04887358254009995)), ((-0.30000000000000004, -0.023333333333333338), 1, -1.0, (-0.6887358254009995, -0.03887358254009995)), ((-0.30000000000000004, -0.023333333333333338), 2, -1.0, (-0.5887358254009996, -0.028873582540099946)), ((-0.30000000000000004, 0.02333333333333333), 0, -1.0, (-0.32206915873433284, -0.002206915873433281)), ((-0.30000000000000004, 0.02333333333333333), 1, -1.0, (-0.22206915873433283, 0.007793084126566721)), ((-0.30000000000000004, 0.02333333333333333), 2, -1.0, (-0.12206915873433283, 0.017793084126566723)), ((-0.30000000000000004, 0.07), 0, -1.0, (0.1445975079323339, 0.044459750793233395)), ((-0.30000000000000004, 0.07), 1, -1.0, (0.24459750793233392, 0.0544597507932334)), ((-0.30000000000000004, 0.07), 2, -1.0, (0.3445975079323339, 0.06445975079323339)), ((0.6, -0.07), 0, -1.0, (-0.10000000000000009, -0.07)), ((0.6, -0.07), 1, -1.0, (-0.04319947632672838, -0.06431994763267283)), ((0.6, -0.07), 2, -1.0, (0.0568005236732716, -0.05431994763267284)), ((0.6, 
-0.023333333333333338), 0, -1.0, (0.3234671903399383, -0.027653280966006166)), ((0.6, -0.023333333333333338), 1, -1.0, (0.42346719033993835, -0.017653280966006164)), ((0.6, -0.023333333333333338), 2, -1.0, (0.5234671903399384, -0.007653280966006166)), ((0.6, 0.02333333333333333), 0, 0.0, (0.6, 0.02333333333333333)), ((0.6, 0.02333333333333333), 1, 0.0, (0.6, 0.02333333333333333)), ((0.6, 0.02333333333333333), 2, 0.0, (0.6, 0.02333333333333333)), ((0.6, 0.07), 0, 0.0, (0.6, 0.07)), ((0.6, 0.07), 1, 0.0, (0.6, 0.07)), ((0.6, 0.07), 2, 0.0, (0.6, 0.07))]

    def recur_match(a, b, fn):
        # Compare two same-shaped nested tuples leaf by leaf with `fn`.
        if type(a) is tuple:
            assert type(b) is tuple
            assert len(a) == len(b)
            for x, y in zip(a, b):
                if not recur_match(x, y, fn):
                    # A bare `False` expression here would be a no-op and
                    # make every comparison succeed; it must be returned.
                    return False
            return True
        else:
            return fn(a, b)

    def close(x, y):
        return np.abs(x - y) < 1e-6

    # Without this check an empty (or truncated) result would pass.
    assert len(results) == len(my_results)
    for r in results:
        found = any(recur_match(dr, r, close) for dr in my_results)
        assert(found)

sample_grid_points_test()

print('Tests passed.')

## Sampling points from policy


### Question
Another way to sample points is to collect points by rolling out trajectories from a policy. Implement this, using the same output format.

For reference, our solution is **8** line(s) of code.

In [None]:
def sample_policy_points(policy, traj_length, num_traj, mdp):
    """Produce samples in the state space by rolling out a policy. Use `mdp.sim_episode`
    to obtain rollouts.

    Args:
    - policy : state -> action
    - traj_length : int  - length of rollout
    - num_traj : int - number of trajectories to rollout
    - mdp : MountainCar
    Return:
    - memory : [tuple (state, action, reward, next_state)]
    """
    # Homework stub: deliberately unimplemented until the student fills it in.
    raise NotImplementedError("Implement me!")

### Tests

In [None]:
def sample_policy_points_test():
    """Check sample_policy_points against reference rollouts.

    The expected transitions are produced by calling `sim_episode` directly
    on a second MountainCar built from an identically seeded generator, so
    both cars see the same random stream. The sampled memory must contain
    exactly those transitions, in order (to within 1e-6).

    (The grid-sampling reference data cannot be used here: policy rollouts
    start at rest in the start region and never visit those grid states.)
    """
    SEED = 3
    TRAJ_LENGTH = 5
    NUM_TRAJ = 2
    policy = lambda s : 1+np.sign(s[1])

    # Reference: NUM_TRAJ consecutive episodes from an identically seeded car.
    reference_mc = MountainCar(rng=np.random.default_rng(SEED))
    expected = []
    for _ in range(NUM_TRAJ):
        expected.extend(reference_mc.sim_episode(policy, max_iters=TRAJ_LENGTH))

    mc = MountainCar(rng=np.random.default_rng(SEED))
    results = list(sample_policy_points(policy, TRAJ_LENGTH, NUM_TRAJ, mc))

    def recur_match(a, b, fn):
        # Compare two same-shaped nested tuples leaf by leaf with `fn`.
        if type(a) is tuple:
            assert type(b) is tuple
            assert len(a) == len(b)
            for x, y in zip(a, b):
                if not recur_match(x, y, fn):
                    return False
            return True
        else:
            return fn(a, b)

    def close(x, y):
        return np.abs(x - y) < 1e-6

    assert len(results) == len(expected)
    for want, got in zip(expected, results):
        assert recur_match(want, got, close)

sample_policy_points_test()

print('Tests passed.')

## Fitted Q Visualization


### Question
Complete the `fitted_Q_learn` function given in the colab notebook. You're now ready to conduct fitted Q learning on the Mountain Car Problem! You may use either sampling method (grid or policy), and either regression method (`KNNRegressor` or `NeuralRegressor`).

In [None]:

import matplotlib
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython import display as display
matplotlib.rc('animation', html='jshtml')

def visualize_qf(qf_regressor):
    '''
    Plots the value function of a fitted Q regressor as a heat map over a
    100 x 100 grid of states, with position on the horizontal axis and
    velocity on the vertical axis.

    Args:
    - qf_regressor : QFRegressor
    '''
    mdp = qf_regressor.mdp
    min_x, max_x = mdp.min_position, mdp.max_position
    min_v, max_v = -mdp.max_speed, mdp.max_speed
    # Rows run over velocity (starting at min_v), columns over position.
    vf = np.array([
        [qf_regressor.fq_value((x, v)) for x in np.linspace(min_x, max_x, 100)]
        for v in np.linspace(min_v, max_v, 100)
    ])
    # origin='lower' puts row 0 (v = min_v) at the bottom, matching the
    # extent; the default origin='upper' would draw the rows upside down
    # relative to the velocity axis labels.
    im = plt.imshow(vf, extent=(min_x, max_x, min_v, max_v), aspect='auto',
                    origin='lower')
    plt.colorbar(im)

def visualize_traj(traj, mode='jupyter', file_path='./mountain_car.mp4'):
    '''
    Visualizes a trajectory. Call with the output of MountainCar.sim_episode

    Args:
    - traj : [tuple (state, action, reward, next_state)]
    - mode : 'jupyter' (default) displays the animation inline; 'gif' and
      'mp4' save it to file_path instead
    - file_path : output path used by the 'gif' and 'mp4' modes
    '''
    # based off https://github.com/mpatacchiola/dissecting-reinforcement-learning/blob/master/environments/mountain_car.py#L105
    if mode not in ('jupyter', 'gif', 'mp4'):
        raise ValueError(f'Unknown mode: {mode!r}')

    # Plot init
    fig = plt.figure()
    ax = fig.add_subplot(111, autoscale_on=False, xlim=(-1.2, 0.6), ylim=(-1.1, 1.1))
    ax.grid(False)  # disable the grid
    x_sin = np.linspace(start=-1.2, stop=0.6, num=100)
    y_sin = np.sin(3 * x_sin)
    ax.plot(x_sin, y_sin)  # plot the sine wave
    dot, = ax.plot([], [], 'ro')
    time_text = ax.text(0.05, 0.9, '', transform=ax.transAxes)
    _position_list = [s[0][0] for s in traj]
    _delta_t = .6

    def _init():
        dot.set_data([], [])
        time_text.set_text('')
        return dot, time_text

    def _animate(i):
        x = _position_list[i]
        y = np.sin(3 * x)
        # Line2D.set_data expects sequences; recent Matplotlib versions
        # reject bare scalars.
        dot.set_data([x], [y])
        time_text.set_text("Time: " + str(np.round(i*_delta_t, 1)) + "s" + '\n' + "Frame: " + str(i))
        return dot, time_text

    ani = animation.FuncAnimation(fig, _animate, np.arange(1, len(_position_list)),
                                    blit=True, init_func=_init, repeat=True, interval=_delta_t * 1000)

    if mode == 'gif':
        ani.save(file_path, writer='imagemagick', fps=int(1/_delta_t))
    elif mode == 'mp4':
        # NOTE(review): 'ffmpeg' replaces the former 'avconv' writer, which
        # current Matplotlib releases appear to no longer provide - confirm.
        ani.save(file_path, fps=int(1/_delta_t), writer='ffmpeg', codec='libx264')
    elif mode == 'jupyter':
        video = ani.to_jshtml()
        html = display.HTML(video)
        display.display(html)
        plt.close()


def fitted_Q_learn(mdp, sampler, qf_regressor, iters):
    '''
    Runs fitted Q learning on a MountainCar instance.

    Each iteration draws a fresh batch of transitions from `sampler`, builds
    one regression data set per action, refits the regressor on it, and
    prints the greedy policy's average reward. Every PRINT_EPOCH-th
    iteration also plots the current value function.

    Args:
    - mdp : MountainCar
    - sampler : (mdp : MountainCar) -> memory : [tuple (state, action, reward, next_state)]
    - qf_regressor : QFRegressor
    - iters : int
    '''
    PRINT_EPOCH = 6
    qf_regressor.initialize()

    for it in range(iters):
        # Per-action regression inputs (states) and targets (values).
        Xd = {action: [] for action in mdp.actions}
        Yd = {action: [] for action in mdp.actions}
        for (s, a, r, s_prime) in sampler(mdp):
            if it == 0 or mdp.terminal(s):
                # TODO: IMPLEMENT ME
                raise NotImplementedError('Set v = something here')
            else:
                # TODO: IMPLEMENT ME. You may find mdp.discount_factor 
                # and qf_regressor.fq_value useful.
                raise NotImplementedError('Set v = something here')
            Xd[a].append(s)
            Yd[a].append(np.array([v]))

        for action in mdp.actions:
            inputs = np.vstack(Xd[action])
            # Stack the targets into a column, then flatten it to 1-D.
            targets = np.vstack(Yd[action])[:, 0]
            qf_regressor.fit(action, inputs, targets)
        qf_regressor.fitted = True

        print(f'Iteration {it}:  {mdp.evaluate(n_play=10, traj_length=100, policy=qf_regressor.fq_greedy)}')
        if (it + 1) % PRINT_EPOCH == 0:
            visualize_qf(qf_regressor)
            plt.show()

This shows an example of running fitted Q learning using `KNNRegressor` with policy sampling. Try it with different regressors, sampling methods and sampling parameters!

In [None]:
def run_fitted_q_example():
    """Run fitted Q learning on Mountain Car with a KNN regressor and
    epsilon-greedy policy sampling, then report and visualize the result."""
    NUM_ITERS = 10
    TRAJ_LENGTH = 40
    NUM_ROLLOUTS = 20
    EPSILON = 0.4

    mc = MountainCar()
    qf_regressor = KNNRegressor(mc)

    def exploration_policy(s):
        # Behaviour policy used to collect the training transitions.
        return qf_regressor.fq_epsilon_greedy(s, EPSILON)

    # Only the mdp argument is left open; the learner supplies it.
    sampler = partial(sample_policy_points, exploration_policy, TRAJ_LENGTH, NUM_ROLLOUTS)
    fitted_Q_learn(mdp=mc, sampler=sampler, qf_regressor=qf_regressor, iters=NUM_ITERS)

    expected_reward = mc.evaluate(100, 100, qf_regressor.fq_greedy)
    print('expected reward =', expected_reward)

    greedy_traj = mc.sim_episode(policy=qf_regressor.fq_greedy, max_iters=100)
    visualize_traj(greedy_traj)
    plt.show()
    visualize_qf(qf_regressor)
    plt.show()