# Homework 8

## Imports and Utilities
**Note**: these imports and functions are available in catsoop. You do not need to copy them in.

In [None]:
from collections import defaultdict
from math import sqrt, log
import abc
import numpy as np


class MDP:
    """Base interface for a Markov Decision Process.

    Subclasses provide the state set, the action set, a reward function
    and a transition model; the discount factor, the horizon and the
    terminal-state test come with defaults.

    Note: the ``abc.abstractmethod`` markers are not enforced, because
    this class does not derive from ``abc.ABC``. A member that is not
    overridden fails when it is used (``NotImplementedError``), not when
    the class is instantiated.
    """

    @property
    @abc.abstractmethod
    def state_space(self):
        """The set of states of the MDP (representation is up to the subclass)."""
        raise NotImplementedError("Override me")

    @property
    @abc.abstractmethod
    def action_space(self):
        """The set of actions of the MDP (representation is up to the subclass)."""
        raise NotImplementedError("Override me")

    @property
    def temporal_discount_factor(self):
        """Discount factor gamma; 1 (no discounting) unless overridden."""
        return 1.0

    @property
    def horizon(self):
        """Horizon H; infinite unless overridden."""
        return np.inf

    def state_is_terminal(self, state):
        """Tell whether ``state`` is a terminal (done) state.

        The base implementation treats no state as terminal.

        Args:
            state: A state.

        Returns:
            state_is_terminal : A bool, always False here.
        """
        return False

    @abc.abstractmethod
    def get_reward(self, state, action, next_state):
        """Deterministic one-step reward for a transition.

        Args:
            state: The state the action was taken in.
            action: The action that was taken.
            next_state: The state that was reached.

        Returns:
            reward : Reward for this single time step.
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_transition_distribution(self, state, action):
        """Distribution over the states reachable from ``state`` via ``action``.

        How the distribution is represented depends on the subclass (for
        example on whether states are discrete or continuous). The default
        ``sample_next_state`` requires a mapping from next state to
        probability.

        Args:
            state: A current state.
            action: An action.

        Returns:
            next_state_distribution: Distribution over next states.
        """
        raise NotImplementedError("Override me")

    def sample_next_state(self, state, action, rng=np.random):
        """Draw one successor state from the transition distribution.

        Subclasses whose distribution is too large to enumerate may
        override this method.

        Args:
            state: A state from the state space.
            action: An action from the action space.
            rng: Source of randomness exposing ``choice(n, p=...)``.

        Returns:
            next_state: The sampled successor state.
        """
        distribution = self.get_transition_distribution(state, action)
        # Split the mapping into parallel sequences of outcomes and weights.
        outcomes, weights = zip(*distribution.items())
        # Sample a position, not an outcome, so states of any type can be
        # returned unchanged.
        return outcomes[rng.choice(len(outcomes), p=weights)]


class SingleRowMDP(MDP):
    """Tiny 1x5 corridor MDP intended for debugging.

    States are the cell indices 0..4 and the agent is meant to start in
    the middle. Reaching the rightmost cell pays +10, reaching the
    leftmost cell pays -10, and every other step costs 1. The two actions
    are left (0) and right (1); with probability 0.1 a move goes the
    opposite way.
    """

    @property
    def state_space(self):
        # Cell index along the row.
        return set(range(5))

    @property
    def action_space(self):
        # 0 moves left, 1 moves right.
        return {0, 1}

    def get_transition_distribution(self, state, action):
        # The distribution is a plain dict: next state -> probability.
        step = -1 if action != 1 else 1
        # Clamp both candidate cells to the corridor [0, 4].
        forward = max(0, min(state + step, 4))
        backward = max(0, min(state - step, 4))
        assert (forward != backward)
        return {forward: 0.9, backward: 0.1}

    def get_reward(self, state, action, next_state):
        if next_state == 4:
            return 10
        # -1 is the living penalty paid on every non-goal step.
        return -10 if next_state == 0 else -1

    def state_is_terminal(self, state):
        # Both ends of the corridor end the episode.
        return state in {4, 0}


class MarshmallowMDP(MDP):
    """The Marshmallow MDP described in lecture.

    A state is ``(hunger level, marshmallow remains)`` with hunger in
    {0, 1, 2}. The actions are "eat" and "wait", the horizon is 4, and the
    reward is minus the squared hunger level of the next state.
    """

    @property
    def state_space(self):
        # Every combination of hunger level and "marshmallow still there?".
        return {(hunger, remains)
                for hunger in (0, 1, 2)
                for remains in (True, False)}

    @property
    def action_space(self):
        return {"eat", "wait"}

    @property
    def horizon(self):
        return 4

    def get_reward(self, state, action, next_state):
        # Only the hunger level that was reached matters.
        return -(next_state[0] ** 2)

    def get_transition_distribution(self, state, action):
        hunger = state[0]
        has_marshmallow = state[1]

        # The marshmallow changes deterministically: eating removes it,
        # anything else leaves it as it was.
        marshmallow_after = False if action == "eat" else has_marshmallow

        # Outcomes that are never assigned implicitly have probability 0.
        outcome_probs = defaultdict(float)

        hunger_drifts = action == "wait" or has_marshmallow == False
        if not hunger_drifts:
            assert action == "eat" and has_marshmallow == True
            # Eating an available marshmallow resets hunger to 0 for sure.
            outcome_probs[(0, marshmallow_after)] = 1.0
            return outcome_probs

        # Nothing was eaten: hunger stays put with probability 0.75 ...
        outcome_probs[(hunger, marshmallow_after)] += 0.75
        # ... and grows by one, capped at 2, with probability 0.25.
        outcome_probs[(min(hunger + 1, 2), marshmallow_after)] += 0.25
        return outcome_probs


class ZitsMDP(MDP):
    """The Zits MDP described in lecture.

    States are the integers 0..4 and the actions are "apply" and "sleep".
    The reward is minus the next state, with an extra cost of 1 when the
    action is "apply". Rewards are discounted by 0.9.
    """

    @property
    def state_space(self):
        return set(range(5))

    @property
    def action_space(self):
        return {"apply", "sleep"}

    @property
    def temporal_discount_factor(self):
        return 0.9

    def get_reward(self, state, action, next_state):
        if action != "apply":
            assert action == "sleep"
            return -next_state
        # Applying costs one extra unit on top of the state penalty.
        return -1 - next_state

    def get_transition_distribution(self, state, action):
        if action != "apply":
            assert action == "sleep"
            # Sleeping moves one level up (capped at 4) or one level down
            # (floored at 0).
            one_up = min(state + 1, 4)
            one_down = max(state - 1, 0)
            return {one_up: 0.4, one_down: 0.6}
        # Applying ignores the current state: it lands on 0 or on 4.
        return {0: 0.8, 4: 0.2}


class ChaseMDP(MDP):
    """Grid world in which an agent chases a randomly moving bunny.

    A state is a pair ``(agent_pos, bunny_pos)`` of ``(row, col)`` cells.
    The agent moves up, down, left or right; a move that would leave the
    grid or enter an obstacle leaves the mover where it is. An episode
    ends when both occupy the same cell.
    """

    @property
    def obstacles(self):
        # Default layout: a 2x3 grid without any obstacle.
        return np.zeros((2, 3))

    @property
    def goal_reward(self):
        return 1

    @property
    def living_reward(self):
        return 0

    @property
    def height(self):
        # Number of grid rows.
        return self.obstacles.shape[0]

    @property
    def width(self):
        # Number of grid columns.
        return self.obstacles.shape[1]

    @property
    def state_space(self):
        rows, cols = range(self.height), range(self.width)
        cells = [(r, c) for r in rows for c in cols]
        # Every (agent cell, bunny cell) combination is a state.
        return {(agent, bunny) for agent in cells for bunny in cells}

    @property
    def action_space(self):
        return {'up', 'down', 'left', 'right'}

    @property
    def temporal_discount_factor(self):
        return 0.9

    def action_to_delta(self, action):
        # (row offset, column offset) for each action; row 0 is the top.
        offsets = {
            'up': (-1, 0),
            'down': (1, 0),
            'left': (0, -1),
            'right': (0, 1),
        }
        return offsets[action]

    def _try_step(self, position, delta):
        # Cell reached from `position` via `delta`, or the same cell when
        # the target is off the grid or an obstacle.
        row, col = position
        r, c = row + delta[0], col + delta[1]
        on_grid = 0 <= r < self.height and 0 <= c < self.width
        if on_grid and not self.obstacles[r, c]:
            return (r, c)
        return (row, col)

    def get_transition_distribution(self, state, action):
        # The distribution is a dict: next state -> probability.
        agent_pos, goal_pos = state
        distribution = defaultdict(float)

        # The agent's move is deterministic.
        next_agent_pos = self._try_step(agent_pos, self.action_to_delta(action))

        # The bunny stays where it is half of the time ...
        distribution[(next_agent_pos, goal_pos)] += 0.5
        # ... and otherwise tries one of the four directions uniformly.
        # Blocked attempts add their mass to the "stay" outcome.
        for delta in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
            next_goal_pos = self._try_step(goal_pos, delta)
            distribution[(next_agent_pos, next_goal_pos)] += 0.5*0.25

        return distribution

    def get_reward(self, state, action, next_state):
        agent_pos, goal_pos = next_state
        # Only the outcome of the step matters: caught the bunny or not.
        return self.goal_reward if agent_pos == goal_pos else self.living_reward

    def state_is_terminal(self, state):
        agent_pos, goal_pos = state
        return agent_pos == goal_pos


class LargeChaseMDP(ChaseMDP):
    """ChaseMDP played on a larger 8x10 grid that contains obstacles."""

    @property
    def obstacles(self):
        # One string per grid row: "1" is an obstacle cell, "0" a free cell.
        layout = (
            "0000000000",
            "0001000011",
            "0001000000",
            "0101101000",
            "0001001000",
            "0110000110",
            "0000000000",
            "0000000000",
        )
        return np.array([[int(cell) for cell in row] for row in layout])


def rollout_Q(mdp, initial_state, Q, nr_simulations=10, max_steps=10):
    """
    Simulate the greedy policy induced by a Q function on an MDP.

    At each step the action with the maximum Q value is executed (ties are
    broken in favour of the larger action). If Q lacks an entry for any
    action of the current state, an action is drawn uniformly at random
    instead. Rewards are summed without discounting.

    Args:
        mdp: An MDP instance.
        initial_state: The initial state of the MDP.
        Q: A dict, Q[state, action] -> Value
        nr_simulations: An int. The number of simulations.
        max_steps: An int. The maximum depth of the unrolling.

    Return:
        The average total reward over multiple simulations.
    """

    # The action set does not depend on the state, so build the list once.
    actions = list(mdp.action_space)

    total_reward = 0
    for _ in range(nr_simulations):
        s = initial_state
        for _step in range(max_steps):
            if mdp.state_is_terminal(s):
                break
            not_initialized = any((s, a) not in Q for a in actions)
            if not_initialized:
                # Draw an index instead of handing the actions to numpy:
                # numpy would coerce them into an array, which fails for
                # tuple actions and turns ints/strings into numpy scalars.
                a = actions[np.random.choice(len(actions))]
            else:
                a = max((Q[s, a], a) for a in actions)[1]
            ns = mdp.sample_next_state(s, a)
            total_reward += mdp.get_reward(s, a, ns)
            s = ns
    return total_reward / nr_simulations


def rollout_policy(mdp, initial_state, policy, nr_simulations=10, max_steps=10):
    """
    Estimate the return of a policy by simulating it on an MDP.

    Every simulation starts in `initial_state` and runs until a terminal
    state is reached or `max_steps` actions have been executed. Rewards
    are summed without discounting.

    Args:
        mdp: An MDP instance.
        initial_state: The initial state of the MDP.
        policy: a Callable, invoked as policy(mdp, current_state), that
            returns the action to execute.
        nr_simulations: An int. The number of simulations.
        max_steps: An int. The maximum depth of the unrolling.

    Return:
        The average total reward over multiple simulations.
    """

    # A single accumulator is shared by all simulations.
    cumulative_reward = 0
    for _episode in range(nr_simulations):
        state = initial_state
        for _step in range(max_steps):
            if mdp.state_is_terminal(state):
                break
            action = policy(mdp, state)
            successor = mdp.sample_next_state(state, action)
            cumulative_reward += mdp.get_reward(state, action, successor)
            state = successor
    return cumulative_reward / nr_simulations



## Problems

### Monte-Carlo Tree Search
Complete the implementation of Monte-Carlo Tree Search (MCTS) for an infinite- or indefinite-horizon MDP.

For reference, our solution is **61** lines of code.

In [None]:
def mcts(mdp, initial_state, exploration_factor=1.0, iterations=100, max_depth=10):
  """Monte-Carlo Tree Search (MCTS) for solving an MDP.

  This is an exercise stub: until it is implemented, calling it raises
  NotImplementedError.

  Typically, an MCTS procedure keeps track of the running time of the algorithm
  to determine when to return. Here, to simplify your implementation, your code
  should run the simulation for `iterations` steps.

  Your code should also bound the number of rollout steps, that is, the
  maximum depth of the search tree, to avoid looping forever.
  A typical way to handle this is to return 0 in the `simulate` function
  when the maximum depth is reached.

  You should implement UCT (i.e., MCTS + UCB) for exploration.
  The term `exploration_factor` is used to balance the Q value of a state-action
  pair and the UCB term for that pair.

  Args:
      mdp: an MDP.
      initial_state: the initial state (i.e., the root of the search tree).
      exploration_factor: a floating-point number. It is the scalar hyperparameter
          applied to the UCB term when choosing which action to expand.
          In the lecture notes, this is referred to as c.
      iterations: the number of iterations of `simulate`.
      max_depth: the maximum depth during rolling-out.

  Returns:
      Presumably the estimated Q values as a dict indexed by
      (state, action): the accompanying test reads the result as
      `Q[state, action]`. TODO: confirm against the assignment text.
  """
  raise NotImplementedError("Implement me!")

Tests

In [None]:
def test1_mcts():
    """Smoke-test `mcts` on SingleRowMDP, starting from the middle cell.

    Checks that the returned Q table has an entry for both actions in each
    of the states 1..4 and that moving right (action 1) is valued strictly
    higher than moving left (action 0) in all of them.
    """
    mdp = SingleRowMDP()
    initial_state = 2
    Q = mcts(mdp, initial_state)
    # range(1, 4 + 1) covers states 1, 2, 3 and 4. State 4 is reported as
    # terminal by SingleRowMDP.state_is_terminal, yet it must have Q entries.
    for i in range(1, 4 + 1):
        assert (i, 0) in Q and (i, 1) in Q
        assert Q[i, 1] > Q[i, 0]

# Run the test immediately when this cell is executed.
test1_mcts()
print('Tests passed.')