# Homework 9

## Imports and Utilities
**Note**: these imports and functions are available in catsoop. You do not need to copy them in.

In [None]:
from collections import defaultdict
from math import sqrt, log
import abc
import numpy as np
import functools


class MDP:
    """Base interface for a Markov Decision Process.

    Subclasses provide the state and action spaces, the reward function and
    the transition model. Discount factor, horizon and terminal-state
    handling have defaults defined here.

    Note: the class does not use an ABC metaclass, so the ``abstractmethod``
    markers are not enforced at instantiation time; un-overridden members
    simply raise ``NotImplementedError`` when used.
    """

    @property
    @abc.abstractmethod
    def state_space(self):
        """Representation of the set of states of this MDP."""
        raise NotImplementedError("Override me")

    @property
    @abc.abstractmethod
    def action_space(self):
        """Representation of the set of actions of this MDP."""
        raise NotImplementedError("Override me")

    @property
    def temporal_discount_factor(self):
        """Discount factor gamma; 1 (no discounting) unless overridden."""
        return 1.0

    @property
    def horizon(self):
        """Horizon H; infinite unless overridden."""
        return float("inf")

    def state_is_terminal(self, state):
        """Tell whether ``state`` is a terminal (done) state.

        The base implementation treats no state as terminal.

        Args:
            state: A state.

        Returns:
            state_is_terminal: A bool; always False here.
        """
        return False

    @abc.abstractmethod
    def get_reward(self, state, action, next_state=None):
        """Return the (deterministic) one-step reward of taking ``action``
        in ``state``.

        Args:
            state: The current state.
            action: The action executed.
            next_state: Optional. The state reached afterwards.

        Returns:
            reward: The reward for this single time step.
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_transition_distribution(self, state, action):
        """Return the distribution over next states.

        The concrete form of the distribution is up to the subclass (for
        example it may differ between discrete and continuous state spaces).

        Args:
            state: The current state.
            action: The action executed.

        Returns:
            next_state_distribution: Distribution over next states.
        """
        raise NotImplementedError("Override me")

    def sample_next_state(self, state, action, rng=np.random):
        """Draw one next state from the transition distribution.

        Subclasses may override this when the explicit distribution is too
        large to enumerate.

        Args:
            state: A state from the state space.
            action: An action from the action space.
            rng: A random number generator exposing ``choice(n, p=...)``.

        Returns:
            next_state: The sampled next state.
        """
        transition = self.get_transition_distribution(state, action)
        # Split the (next_state, probability) pairs into two parallel tuples.
        candidates, weights = zip(*transition.items())
        picked = rng.choice(len(candidates), p=weights)
        return candidates[picked]


class POMDP(MDP):
    """A partially observable Markov decision process (POMDP).

    Extends the MDP interface with an observation space and an observation
    model p(o | s', a).
    """

    @property
    @abc.abstractmethod
    def observation_space(self):
        """Representation of the POMDP observation space.
        """
        raise NotImplementedError("Override me")

    # This is an instance method (it takes ``self``), so it is marked with
    # ``abstractmethod`` rather than the deprecated ``abstractclassmethod``,
    # which would turn it into a classmethod and bind the class as ``self``.
    @abc.abstractmethod
    def get_observation_distribution(self, next_state, action):
        """Return a distribution over the observations.

        The form of this distribution will vary, e.g., depending
        on whether the POMDP has discrete or continuous observation
        spaces.

        Args:
            next_state: The next state.
            action: The action taken.

        Returns:
            observation_distribution: Distribution over the observation.
        """
        raise NotImplementedError("Override me")


class LambdaMDP(MDP):
    """An MDP assembled from plain values and callables.

    Rather than subclassing MDP, pass the spaces and the three model
    functions to the constructor.
    """

    def __init__(
        self,
        state_space,
        action_space,
        state_is_terminal_fn,
        get_reward_fn,
        get_transition_distribution_fn,
        temporal_discount_factor=1.0,
    ):
        """Build an MDP out of the given function definitions.

        Args:
            state_space: The set of possible states.
            action_space: The set of possible actions.
            state_is_terminal_fn: Callable ``state_is_terminal_fn(state) -> bool``
                telling whether a state is terminal.
            get_reward_fn: Callable
                ``get_reward_fn(state, action, next_state) -> float`` giving the
                reward of an (s, a, s') tuple.
            get_transition_distribution_fn: Callable
                ``get_transition_distribution_fn(state, action)`` returning the
                distribution of the next state. The returned value must be a
                discrete distribution.
            temporal_discount_factor: A float, the temporal discount factor
                of the MDP.
        """
        super().__init__()

        # Plain values, exposed through the read-only properties below.
        self.state_space_v = state_space
        self.action_space_v = action_space
        self.temporal_discount_factor_v = temporal_discount_factor

        # The callables are stored as instance attributes under the MDP
        # method names, so they shadow the class-level methods on lookup.
        self.state_is_terminal = state_is_terminal_fn
        self.get_reward = get_reward_fn
        self.get_transition_distribution = get_transition_distribution_fn

    @property
    def state_space(self):
        """The state space passed to the constructor."""
        return self.state_space_v

    @property
    def action_space(self):
        """The action space passed to the constructor."""
        return self.action_space_v

    @property
    def temporal_discount_factor(self):
        """The discount factor passed to the constructor."""
        return self.temporal_discount_factor_v


class DiscreteDistribution(object):
    """A discrete distribution, represented as a dictionary."""

    # Tolerance used by check_normalization().
    eps = 1e-6

    def __init__(self, prob_dict, support=None):
        """Construct a discrete distribution based on the support set
        and the probability dictionary. The dictionary might be sparse,
        in which case the omitted entries are treated as zero-probability
        values.

        The support argument denotes the set of possible values for the
        random variable. It can be left as optional, especially when the underlying
        support set is a continuous set (e.g., all real numbers). Note
        that, even in this case, we can still define a "discrete distribution",
        that is, a distribution that only has mass on a finite set of points.
        For example, we can define a distribution on R: {0: 0.5, 1: 0.5}.
        Implicitly, all values not in the prob_dict will be treated as
        zero-probability.

        Example:

        ```
        p = DiscreteDistribution({'x': 0.3, 'y': 0.3, 'z': 0.4}, {'x', 'y', 'z'})
        print(p.p('x'))  # 0.3
        for x in p:  # iterate over the keys of the probability dictionary.
            print(x, p.p(x))  # prints: x 0.3, y 0.3, z 0.4
        for x, p_x in p.items():  # just like iterating over a Python dict.
            print(x, p_x)  # prints: x 0.3, y 0.3, z 0.4
        ```

        Args:
            prob_dict: A dictionary, mapping elements in support to a float
                number. The dictionary might be sparse. It should always
                sum up to one (thus being a valid distribution); this is not
                checked here, see check_normalization().
            support (Optional): A set (or frozenset) of objects.

        Raises:
            AssertionError: If the argument types are wrong, or a key of
                prob_dict is not contained in the given support.
        """
        # isinstance (rather than an exact type match) so that dict
        # subclasses and frozensets are accepted as well.
        assert isinstance(prob_dict, dict) and (
            support is None or isinstance(support, (set, frozenset))
        )

        self.prob_dict = prob_dict
        self.support = support

        if self.support is not None:
            for k in self.prob_dict:
                assert k in self.support

    def __iter__(self):
        """Iterate over the values that have an entry in the probability
        dictionary (support elements omitted from it are not produced)."""
        yield from self.prob_dict

    def items(self):
        """Iterate over the distribution, yielding (x, p(x)) pairs.

        Only values with an entry in the probability dictionary are
        produced; support elements omitted from it are skipped.
        """
        yield from self.prob_dict.items()

    def p(self, value):
        """Evaluate the probability of a value in the support set.

        Args:
            value: An object in the support set.

        Returns:
            p: A float, indicating p(value); 0 when the value has no entry
                in the probability dictionary.

        Raises:
            AssertionError: If a support was given and value is not in it.
        """
        if self.support is not None:
            assert value in self.support
        return self.prob_dict.get(value, 0.)

    def renormalize(self):
        """Renormalize the distribution to ensure that the probabilities sum up to 1.

        The probability dictionary is replaced by a new, rescaled dictionary.

        Returns:
            self

        Raises:
            AssertionError: If the total mass is not positive.
        """
        z = sum(self.prob_dict.values())
        assert z > 0, 'Degenerated probability distribution.'
        self.prob_dict = {k: v / z for k, v in self.prob_dict.items()}
        return self

    def check_normalization(self):
        """Check if the prob dict is correctly normalized (i.e., should sum up to 1,
        within the class tolerance ``eps``)."""
        assert 1 - type(self).eps < sum(self.prob_dict.values()) < 1 + type(self).eps

    def max(self):
        """Return max_x p(x) and argmax_x p(x).

        Ties in probability are broken by taking the largest value when the
        tied values can be ordered. When they cannot be compared with each
        other (e.g. the mixed keys 0 and 'T', or arbitrary objects), the
        first maximal entry in dictionary order is returned instead of
        failing with a TypeError.

        Returns:
            p_max: A float, max_x p(x).
            arg_max: An object in the support, argmax_x p(x).
        """
        try:
            return max((v, k) for k, v in self.prob_dict.items())
        except TypeError:
            # Tuple comparison fell through to comparing unorderable keys;
            # compare on the probability alone.
            arg_max = max(self.prob_dict, key=self.prob_dict.get)
            return self.prob_dict[arg_max], arg_max

    def draw(self, rng=None):
        """Sample one value according to the stored probabilities.

        Args:
            rng (Optional): A random number generator exposing
                ``choice(n, p=...)``. Defaults to ``np.random``.

        Returns:
            value: The sampled key of the probability dictionary.
        """
        if rng is None:
            rng = np.random
        keys = list(self.prob_dict.keys())
        probs = [self.prob_dict[k] for k in keys]
        return keys[rng.choice(len(keys), p=probs)]

    def __str__(self) -> str:
        return str(self.prob_dict)

    def __repr__(self) -> str:
        # Readable representation, so that distributions nested inside
        # containers (e.g. as keys of another distribution) print usefully.
        return f"{type(self).__name__}({self.prob_dict!r})"


def OnehotDiscreteDistribution(support, obj):
    """Build a DiscreteDistribution over ``support`` with all mass on ``obj``,
    i.e. p(obj) = 1."""
    point_mass = {obj: 1.0}
    return DiscreteDistribution(point_mass, support=support)


def UniformDiscreteDistribution(support):
    """Build the uniform DiscreteDistribution over ``support``: every element x
    of the support gets p(x) = 1 / |support|."""
    size = len(support)
    # The division stays inside the comprehension so that an empty support
    # still yields an empty distribution rather than dividing by zero.
    equal_mass = {element: 1 / size for element in support}
    return DiscreteDistribution(equal_mass, support=support)

# Our RobotChargingPOMDP

class RobotChargingPOMDP(POMDP):
    """The robot-charging POMDP: look / move / charge / nop actions over
    three regular states plus one terminal state."""

    # Default model parameters, used as constructor defaults.
    DEF_MOVE_SUCCESS = 0.8
    DEF_OBS_IF_THERE = 0.9
    DEF_OBS_IF_NOT_THERE = 0.4
    DEF_C_MOVE = 0.5
    DEF_C_LOOK = 0.1
    DEF_GAMMA = 0.9

    def __init__(
        self,
        p_move_success=DEF_MOVE_SUCCESS,
        p_obs_if_there=DEF_OBS_IF_THERE,
        p_obs_if_not_there=DEF_OBS_IF_NOT_THERE,
        c_move=DEF_C_MOVE,
        c_look=DEF_C_LOOK,
        gamma=DEF_GAMMA,
    ):
        """
        Create the Robot Charging POMDP.

        Args:
            p_move_success (float): probability that a move action succeeds.
            p_obs_if_there (float): probability of observing 1 when looking at
                the location that has the charger.
            p_obs_if_not_there (float): probability of observing 1 when looking
                at a location without the charger.
            c_move (float): cost of a move action.
            c_look (float): cost of a look action.
            gamma (float): the temporal discount factor.
        """
        super().__init__()
        # Transition / observation model parameters.
        self.p_move_success = p_move_success
        self.p_obs_if_there = p_obs_if_there
        self.p_obs_if_not_there = p_obs_if_not_there
        # Action costs and discounting.
        self.c_move = c_move
        self.c_look = c_look
        self.gamma = gamma

    @property
    def state_space(self):
        """
        Three "normal" states 0, 1, 2 that indicate the position of the
        charger, plus one "terminal" state 'T'. Executing the charge action
        'c' deterministically moves to 'T', and 'T' is absorbing.
        """
        return {0, 1, 2, 'T'}

    @property
    def action_space(self):
        """The available actions.

        lx: look(x); mxy: move(start=x, target=y); c: charge; nop: NOP.
        """
        return {'l0', 'l1', 'l2', 'm01', 'm12', 'm20', 'c', 'nop'}

    @property
    def observation_space(self):
        """Binary observations 0 and 1."""
        return {0, 1}

    @property
    def temporal_discount_factor(self):
        """The discount factor ``gamma`` given at construction."""
        return self.gamma

    def state_is_terminal(self, state):
        """Only the state 'T' is terminal."""
        return state == 'T'

    def get_reward(self, state, action, next_state=None):
        """Reward of taking ``action`` in ``state`` (``next_state`` is unused).

        nop is free, charging yields 10 in state 0 and -100 in any other
        state, moves cost ``c_move`` and every remaining action (the looks)
        costs ``c_look``.
        """
        if action == 'nop':
            return 0
        if action == 'c':
            return 10 if state == 0 else -100
        if action.startswith('m'):
            return -self.c_move
        # Anything else is a look action.
        return -self.c_look

    def get_transition_distribution(self, state, action):
        """Distribution over the next state.

        Charging always leads to 'T'. A move 'mxy' executed in state x
        reaches y with probability ``p_move_success`` and otherwise stays in
        x. In every other case the state does not change.
        """
        if action == 'c':
            return OnehotDiscreteDistribution(self.state_space, 'T')
        if action.startswith('m'):
            start = int(action[1])
            target = int(action[2])
            if state == start:
                outcome = {target : self.p_move_success, start : 1 - self.p_move_success}
                return DiscreteDistribution(outcome, support=self.state_space)
        # Looks, nop, and moves whose start does not match: stay put.
        return OnehotDiscreteDistribution(self.state_space, state)

    def get_observation_distribution(self, next_state, action):
        """Distribution over the observation after ``action`` led to
        ``next_state``.

        A look action 'lx' observes 1 with probability ``p_obs_if_there``
        when next_state equals x and with ``p_obs_if_not_there`` otherwise.
        Every non-look action observes 0 with certainty.
        """
        if not action.startswith('l'):
            return OnehotDiscreteDistribution(self.observation_space, 0)
        looked_at = int(action[1])
        if next_state == looked_at:
            p_one = self.p_obs_if_there
        else:
            p_one = self.p_obs_if_not_there
        return DiscreteDistribution({0: 1 - p_one, 1: p_one}, support=self.observation_space)


def expectimax_search(initial_state, mdp, horizon, most_likely_state=False, return_Q=False):
    '''Use expectimax search to determine a next action.

    Note that we're just computing the single next action to
    take, we do not need to store the entire partial V.

    Horizon is given as a separate argument so that we can use
    expectimax search with receding horizon control, for example,
    even if mdp.horizon is inf.

    Args:
        initial_state: A state in the mdp. States must be hashable, since
            the value function is memoized on (state, depth).
        mdp: An MDP. Its transition distributions only need to provide
            ``items()`` yielding (next_state, probability) pairs.
        horizon: An int horizon (the search depth).
        most_likely_state: A boolean value.
            If true, compute Q based on the most likely next state only,
            instead of the expectation over all next states.
        return_Q: A boolean value. If true, return the Q values of all
            actions at the root state in addition to the action.

    Returns:
        action: An action in the mdp with the highest root Q value.
        Q: A dict mapping each action to its Q value at the root state
            (only when return_Q is True, as the second tuple element).
    '''
    A = mdp.action_space
    R = mdp.get_reward
    P = mdp.get_transition_distribution
    gm = mdp.temporal_discount_factor
    ts = mdp.state_is_terminal

    def most_likely(outcomes):
        """Pick the next state with the highest probability.

        Ties are broken by the largest state when the tied states can be
        ordered. When they cannot (e.g. belief objects), the first maximal
        outcome is used instead of failing with a TypeError.
        """
        try:
            return max((p, ns) for ns, p in outcomes)[1]
        except TypeError:
            return max(outcomes, key=lambda outcome: outcome[1])[0]

    @functools.lru_cache(maxsize=None)
    def V(s, h):
        # ">=" rather than "==" so that a horizon <= 0 cannot recurse forever.
        if h >= horizon or ts(s):
            return 0
        return max(Q(s, a, h) for a in A)

    def Q(s, a, h):
        # Materialize once: items() may be a one-shot generator.
        outcomes = list(P(s, a).items())
        if most_likely_state:
            ns = most_likely(outcomes)
            return R(s, a, ns) + gm * V(ns, h + 1)
        return sum(p * (R(s, a, ns) + gm * V(ns, h + 1)) for ns, p in outcomes)

    Q_values = {a: Q(initial_state, a, 0) for a in A}
    best_action = max(A, key=Q_values.get)
    if return_Q:
        return best_action, Q_values
    return best_action



## Problems

### Transition Update
Complete the implementation of the transition update function over beliefs.

For reference, our solution is **8** lines of code.

In [None]:
def transition_update(pomdp, prior, action):
    """Push a belief through the transition model.

    Given the prior p(s) over the current state and the transition
    distribution p(s' | s, action), the result should be
    p(s') = sum_s p(s) * p(s' | s, action).

    This is an exercise stub and is intentionally left unimplemented.

    Args:
        pomdp (POMDP): A POMDP object.
        prior (DiscreteDistribution): A distribution over the current state s.
        action: The action to be executed.

    Returns:
        updated_prior (DiscreteDistribution): A distribution over the next state s'.

    Raises:
        NotImplementedError: Always, until the exercise is solved.
    """
    raise NotImplementedError("Implement me!")

Tests

In [None]:
def test1_transition_update():
    """Check transition_update for the move action 'm01' with default parameters."""
    pomdp = RobotChargingPOMDP()
    start_belief = DiscreteDistribution({0: 0.4, 1: 0.3, 2: 0.3}, pomdp.state_space)
    moved_belief = transition_update(pomdp, start_belief, 'm01')
    # The mass of state 0 moves to state 1 with the move success probability
    # 0.8: 0.4 * 0.2 = 0.08 stays, 0.3 + 0.4 * 0.8 = 0.62 ends up in state 1.
    expected = {0: 0.08, 1: 0.62, 2: 0.3}
    for state, prob in expected.items():
        assert np.allclose(moved_belief.p(state), prob)

test1_transition_update()
print('Tests passed.')

### Observation Update
Complete the implementation of the observation update function over beliefs.

For reference, our solution is **4** lines of code.

In [None]:
def observation_update(pomdp, prior, action, observation):
    """Condition a belief on an observation using Bayes' rule.

    The posterior p(s' | o, a) is proportional to p(s' | a) * p(o | s', a).

    This is an exercise stub and is intentionally left unimplemented.

    Args:
        pomdp (POMDP): A POMDP object.
        prior (DiscreteDistribution): The prior distribution over the next state: p(s' | a).
            Typically, this is the output of the transition_update() function.
        action: The action taken.
        observation: The observation.

    Returns:
        posterior (DiscreteDistribution): The posterior distribution over the next state s'.

    Raises:
        NotImplementedError: Always, until the exercise is solved.
    """
    raise NotImplementedError("Implement me!")

Tests

In [None]:
def test1_observation_update():
    """Check observation_update after looking at location 0 and observing 0."""
    pomdp = RobotChargingPOMDP()
    start_belief = DiscreteDistribution({0: 0.4, 1: 0.3, 2: 0.3}, pomdp.state_space)
    predicted = transition_update(pomdp, start_belief, 'l0')
    posterior = observation_update(pomdp, predicted, 'l0', 0)
    expected = {0: 0.1, 1: 0.45, 2: 0.45}
    for state, prob in expected.items():
        assert np.allclose(posterior.p(state), prob)

test1_observation_update()
print('Tests passed.')

### Belief Filter
Complete the implementation of the belief filter function.

For reference, our solution is **5** lines of code.

In [None]:
def belief_filter(pomdp, belief, action, observation):
    """Compute the updated belief over the states from the current action and observation.

    The intended process is:
        1. the agent is at state s, and has a belief about its current state p(s).
        2. the agent takes an action a, and has a belief about its next state p(s' | a),
            computed by transition_update.
        3. the agent observes o, which follows the observation model of the POMDP p(o | s', a).
        4. the agent updates its belief over the next state p(s' | o, a), following the Bayes rule.

    This is an exercise stub and is intentionally left unimplemented.

    Args:
        pomdp (POMDP): A POMDP object.
        belief (DiscreteDistribution): The belief about the agent's current state.
        action: The action taken.
        observation: The observation.

    Returns:
        next_belief: The belief about the next state, taking into account the action
            at this step and the observation.

    Raises:
        NotImplementedError: Always, until the exercise is solved.
    """
    raise NotImplementedError("Implement me!")

Tests

In [None]:
def test1_belief_filter():
    """Check one belief_filter step: look at location 0 and observe 0."""
    pomdp = RobotChargingPOMDP()
    start_belief = DiscreteDistribution({0: 0.9, 1: 0.05, 2: 0.05}, pomdp.state_space)
    filtered = belief_filter(pomdp, start_belief, 'l0', 0)
    expected = {0: 0.6, 1: 0.2, 2: 0.2}
    for state, prob in expected.items():
        assert np.allclose(filtered.p(state), prob)

test1_belief_filter()
print('Tests passed.')

### Belief-Space MDP
In this section, you will implement a function `create_belief_mdp` that transforms a POMDP into a belief-space MDP.
    We have provided the basic skeleton for you. In particular, you only need to implement the `get_reward` and the `get_transition_distribution`
    functions for the Belief MDP.

For reference, our solution is **79** lines of code.

In [None]:
def create_belief_mdp(pomdp):
    """Constructs a belief-space MDP from a POMDP.

    The states of the resulting MDP are beliefs, i.e. DiscreteDistribution
    objects over the states of the POMDP; the actions are those of the POMDP.

    Args:
        pomdp: The input POMDP object.

    Returns:
        belief_mdp: The constructed belief-space MDP (a LambdaMDP).
    """
    def state_is_terminal(belief):
        """The state_is_terminal function for the belief-space MDP. It returns true iff all possible states
        in the belief are terminal states.

        Args:
            belief: A DiscreteDistribution of the state.

        Returns:
            is_terminal: Whether the current belief is a "terminal" belief.
        """
        for state, p in belief.items():
            # A single non-terminal state with positive mass is enough to
            # make the belief non-terminal.
            if p > 0 and not pomdp.state_is_terminal(state):
                return False
        return True

    def get_reward(belief, action, next_belief=None):
        """Compute the expected reward function for the belief-space MDP.

        You only need to implement the case where the original reward function only
        depends on the state and the action (but not the next state).

        In this case, the reward function of the belief-space MDP will be only a function
        of belief and action, but not next_belief.

        In general (where the reward function is a function of state, action, and next state),
        in order to compute the expected reward, we need to also marginalize over the next state
        distribution (which is next_belief).

        This is an exercise stub and is intentionally left unimplemented.

        Args:
            belief: A DiscreteDistribution of the state.
            action: An action.
            next_belief: A DiscreteDistribution of the next state. Should be ignored (see above).

        Returns:
            reward: the expected reward at this step.
        """
        raise NotImplementedError()

    def get_transition_distribution(belief, action):
        """Compute the transition distribution for an input belief and an action.

        Specifically, the output will be a distribution over beliefs. That is, a distribution over
        distributions. Since we have restricted our observation space to be finite, the
        set of possible next beliefs is also finite. Thus, we can still use a DiscreteDistribution
        object to represent the distribution over the next belief.

        This is an exercise stub and is intentionally left unimplemented.

        Args:
            belief: A DiscreteDistribution of the state.
            action: An action.

        Returns:
            next_belief_distribution: A DiscreteDistribution over next beliefs
                (each of which is itself a DiscreteDistribution of the next state).
        """
        raise NotImplementedError()

    # Construct a new MDP based on the functions defined above.
    return LambdaMDP(
        state_space=None,  # We are not going to specify the state space explicitly (it's a continuous space).
        action_space=pomdp.action_space,
        state_is_terminal_fn=state_is_terminal,
        get_reward_fn=get_reward,
        get_transition_distribution_fn=get_transition_distribution,
        temporal_discount_factor=pomdp.temporal_discount_factor
    )

Tests

In [None]:
def test1_create_belief_mdp():
    """Run a depth-4 expectimax search on the belief MDP and compare the root
    Q values against reference numbers."""
    pomdp = RobotChargingPOMDP()
    belief_mdp = create_belief_mdp(pomdp)
    root_belief = DiscreteDistribution({0: 0.4, 1: 0.3, 2: 0.3}, support=pomdp.state_space)
    a, Q = expectimax_search(root_belief, belief_mdp, 4, return_Q=True)
    print(a, Q)

    assert a == 'l1'
    # Reference root Q value for every action.
    reference = {
        'l0': -0.1,
        'l1': 0.255914,
        'l2': -0.1,
        'm12': -0.471128,
        'm01': -0.5,
        'm20': -0.031586,
        'c': -56.0,
        'nop': 0.0,
    }
    for action, q_ref in reference.items():
        assert action in Q and np.allclose(q_ref, Q[action])

test1_create_belief_mdp()
print('Tests passed.')

### Receding Horizon Control
Use the RHC implementation provided in the colab and answer the questions in catsoop.

A simple implementation of Receding Horizon Control (RHC).

In [None]:
def receding_horizon_control(pomdp, initial_belief, h=4, n=4, mlo=False):
    """
    Simulate receding horizon control and print a trace of the run.

    The simulation runs for at most n time steps and tracks two quantities:
        - belief: the robot's current belief.
        - true_state: the "true" state of the world, initially drawn from
          the initial belief.

    At every step the action is chosen by an expectimax search of depth h on
    the belief MDP, the true state and an observation are sampled from the
    POMDP, and the belief is updated with belief_filter. The loop stops early
    once the true state is terminal. Nothing is returned.

    Args:
        pomdp: The input POMDP problem.
        initial_belief (DiscreteDistribution): a distribution of the state (the initial belief).
        h (int): The receding horizon.
        n (int): The number of simulation steps.
        mlo (bool): Use the Most-Likely-Observation approximation during the search. This is
            equivalent to using the most-likely-state approximation on the Belief MDP.
    """
    # The planner works on the belief-space MDP derived from the POMDP.
    belief_mdp = create_belief_mdp(pomdp)

    belief = initial_belief
    true_state = belief.draw()

    print('Robot belief:', belief)
    print('Real state:', true_state)
    print('')

    for step in range(n):
        print('Step', step)
        # Plan from the current belief; only the first action is executed.
        action = expectimax_search(belief, belief_mdp, h, most_likely_state=mlo)
        print('  Executing:', action)

        # Advance the real world.
        true_state = pomdp.get_transition_distribution(true_state, action).draw()
        print('  Real State:', true_state)
        if pomdp.state_is_terminal(true_state):
            print('Terminated.')
            break

        # Sense, then fold the action and the observation into the belief.
        observation = pomdp.get_observation_distribution(true_state, action).draw()
        print('  Observation:', observation)
        belief = belief_filter(pomdp, belief, action, observation)
        print('  Robot belief:', belief)