# HW 3.1 Solutions

See pset 1 for dependency installation instructions and see the problem set for deliverables.

In [None]:
# Install dependencies (run this once every 12 hours)
!pip install --upgrade git+https://github.com/tomsilver/pddlgym # Install most recent PDDLGym (must be from source!)
!pip install tabulate

In [None]:
from pddlgym.rendering import sar_render_from_string_grid
from collections import defaultdict
from tabulate import tabulate
from scipy.stats import binom
import abc
import numpy as np
import itertools
import functools

## Markov Decision Processes (MDPs)

In [None]:
class MDP(abc.ABC):
    """An environment API that exposes transition probabilities,
    for use with stochastic planners like value iteration.

    This is an abstract base class: deriving from ``abc.ABC`` is what
    makes the ``@abc.abstractmethod`` markers below effective, so a
    subclass that forgets to override one of them fails at instantiation
    time rather than later, in the middle of planning.
    """
    @abc.abstractmethod
    def get_all_states(self):
        """Return a list of all possible states of the environment.

        We're restricted to small environments because we need
        this function.

        Returns
        -------
        states : [ hashable ]
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_all_actions(self):
        """Return a list of all possible actions of the environment.

        Returns
        -------
        actions : [ hashable ]
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_transition_probabilities(self, state, action):
        """Return a dictionary of next_states to probabilities.

        Parameters
        ----------
        state : hashable
            The state in which the action is executed.
        action : hashable
            The action being executed.

        Returns
        -------
        next_states : { hashable : float }
            Maps next state to prob. Sums to 1.
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_reward(self, state, action, next_state):
        """Return (deterministic) reward for executing action
        in state and arriving in next_state.

        Parameters
        ----------
        state : hashable
        action : hashable
        next_state : hashable

        Returns
        -------
        reward : float
            Single time step reward.
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def get_initial_states(self):
        """Designate certain states as initial states. Not
        always part of the standard MDP formalism.

        Returns
        -------
        states : [ hashable ]
        """
        raise NotImplementedError("Override me")

    @abc.abstractmethod
    def state_is_terminal(self, state):
        """Designate certain states as terminal (done) states. Not
        always part of the standard MDP formalism.

        Parameters
        ----------
        state : hashable

        Returns
        -------
        is_terminal : bool
        """
        raise NotImplementedError("Override me")

### Debug 1D Grid MDP

In [None]:
class Debug1DGridMDP(MDP):
    """Tiny 1x5 corridor MDP intended for debugging planners.

    Cells are numbered 0..4 and the single initial state is the
    middle cell (2). Arriving in cell 4 yields +10, arriving in
    cell 0 yields -10, and every other arrival costs -1. The two
    actions are -1 (left) and +1 (right); with probability 0.1 the
    move goes in the opposite direction instead. Cells 0 and 4 are
    terminal.
    """
    def get_all_states(self):
        return list(range(5))

    def get_all_actions(self):
        left, right = -1, 1
        return [left, right]

    def get_transition_probabilities(self, state, action):
        assert state in self.get_all_states()
        assert action in self.get_all_actions()

        def clip_to_grid(cell):
            # Moves that would leave the corridor stay on the end cell
            return min(max(cell, 0), 4)

        forward = clip_to_grid(state + action)
        backward = clip_to_grid(state - action)
        assert forward != backward
        return {forward: 0.9, backward: 0.1}

    def get_reward(self, state, action, next_state):
        if next_state == 4:
            return 10
        # Otherwise: big penalty on the left end, living penalty elsewhere
        return -10 if next_state == 0 else -1

    def get_initial_states(self):
        middle_cell = 2
        return [middle_cell]

    def state_is_terminal(self, state):
        end_cells = [0, 4]
        return state in end_cells

### Search and Rescue MDPs

In [None]:
# General utility functions
def get_all_reachable_states(initial_states, actions, T):
    """Utility for deriving all states that are reachable
    from the initial states with >0 probability.

    Parameters
    ----------
    initial_states : iterable of hashable
        States from which exploration starts.
    actions : iterable of hashable
        Actions tried in every visited state.
    T : fn: (state, action) -> { next_state : prob }
        Transition function.

    Returns
    -------
    reachable_states : set of hashable
        The initial states plus every state reachable through
        transitions with strictly positive probability.
    """
    # Materialize once: `actions` is re-iterated for every visited state,
    # so a one-shot iterable would otherwise be exhausted after the first.
    actions = list(actions)
    reachable_states = set(initial_states)
    # Invariant: every state on the frontier is already in reachable_states,
    # so a single set lookup suffices to detect unseen states.
    frontier = list(reachable_states)
    while frontier:
        state = frontier.pop()
        for action in actions:
            for next_state, prob in T(state, action).items():
                if prob > 0 and next_state not in reachable_states:
                    reachable_states.add(next_state)
                    frontier.append(next_state)
    return reachable_states

def combine_factored_distributions(factored_dists):
    """Utility for creating a distribution of states from factored dists.

    Parameters
    ----------
    factored_dists : { key : { value : float } }
        One independent distribution per state variable.

    Returns
    -------
    distribution : { frozenset of (key, value) pairs : float }
        Joint distribution over full states, where the probability of a
        state is the product of the per-variable probabilities.

    Raises
    ------
    AssertionError
        If the joint probabilities do not sum to 1 (within 1e-6).
    """
    keys = list(factored_dists)
    per_key_outcomes = [list(factored_dists[k].items()) for k in keys]
    distribution = {}
    for outcome in itertools.product(*per_key_outcomes):
        state = frozenset((key, value) for key, (value, _) in zip(keys, outcome))
        # Multiply directly rather than via exp(sum(log(p))): that avoids
        # log(0) on zero-probability entries and a needless loss of precision.
        distribution[state] = np.prod([prob for _, prob in outcome])
    assert abs(1. - sum(distribution.values())) < 1e-6, \
        "Combined probabilities do not sum to 1."
    return distribution


class SARMDP(MDP):
    """Shared base for the search and rescue grid MDPs.

    States are frozensets of (name, value) pairs that include at least
    'robot_loc' and 'person_loc'. Subclasses provide the grid constants
    read here (HEIGHT, WIDTH, WALL_LOCS), the reward constants
    (RESCUE_REWARD, FIRE_REWARD, LIVING_REWARD), and `_loc_on_fire`.
    """
    UP, DOWN, LEFT, RIGHT = range(4)

    @abc.abstractmethod
    def _loc_on_fire(self, loc, state):
        """Return whether grid location `loc` is burning in `state`
        (`state` is passed as a plain dict).
        """
        raise NotImplementedError("Override me")

    def get_all_states(self):
        # Generic derivation: explore everything reachable from the start
        start_states = self.get_initial_states()
        every_action = self.get_all_actions()
        return get_all_reachable_states(
            start_states, every_action, self.get_transition_probabilities)

    def get_all_actions(self):
        return [self.UP, self.DOWN, self.LEFT, self.RIGHT]

    def _allowed_robot_loc(self, r, c):
        """Helper for transitions: is (r, c) inside the grid and not a wall?
        """
        inside_grid = 0 <= r < self.HEIGHT and 0 <= c < self.WIDTH
        return inside_grid and (r, c) not in self.WALL_LOCS

    def get_reward(self, state, action, next_state):
        arrived = dict(next_state)
        robot = arrived['robot_loc']
        # Rescue takes precedence over fire
        if robot == arrived['person_loc']:
            return self.RESCUE_REWARD
        if self._loc_on_fire(robot, arrived):
            return self.FIRE_REWARD
        return self.LIVING_REWARD

    def state_is_terminal(self, state):
        facts = dict(state)  # make easy to query
        robot = facts['robot_loc']
        # Done once the robot reaches the person or stands in a fire
        return bool(robot == facts['person_loc']
                    or self._loc_on_fire(robot, facts))

    def render(self, state, mode='string'):
        """Draw `state` either as a multi-line string (mode='string',
        one capital letter per cell) or via sar_render_from_string_grid
        (mode='rgb').
        """
        facts = dict(state)
        grid = np.full((self.HEIGHT, self.WIDTH), 'open', dtype=object)
        # Later layers overwrite earlier ones: walls, fire, robot, person
        for wall_r, wall_c in self.WALL_LOCS:
            grid[wall_r, wall_c] = 'wall'
        for row in range(self.HEIGHT):
            for col in range(self.WIDTH):
                if self._loc_on_fire((row, col), facts):
                    grid[row, col] = 'fire'
        robot_r, robot_c = facts['robot_loc']
        grid[robot_r, robot_c] = 'robot'
        person_r, person_c = facts['person_loc']
        grid[person_r, person_c] = 'person'
        if mode != 'string':
            assert mode == 'rgb'
            return sar_render_from_string_grid(grid)
        # Each cell name is abbreviated to its capitalized first letter
        return '\n'.join(
            ''.join(cell[0].upper() for cell in grid_row)
            for grid_row in grid
        )


class SARStochasticFiresMDP(SARMDP):
    """Search and rescue MDP with flickering fires and a wandering person.

    The layout is:

        OOOROOO
        OWWWWWO
        OFOPOFO
        OWWWWWO

    where O is empty space, W is wall, F is a fire site, P is the person
    and R is the robot.

    Dynamics, per time step:
      * the robot moves deterministically (blocked moves leave it in place);
      * each of the two fires independently flips its on/off status with
        probability 0.05 and keeps it with probability 0.95;
      * the person stays put with probability 0.75, otherwise moves to an
        adjacent cell among the three middle cells (0.25 split evenly
        over the available neighbors).

    Actions are up/down/left/right. Fires are sink states.

    Living penalty is -1. Reward for rescue is +100. Penalty for fire
    is -100 (and also fires terminate the episode.)
    """
    HEIGHT, WIDTH = 4, 7
    WALL_LOCS = [(1, c) for c in range(1, WIDTH-1)] + \
                [(3, c) for c in range(1, WIDTH-1)]
    FIRE_LEFT_LOC = (2, 1)
    FIRE_RIGHT_LOC = (2, WIDTH-2)
    PERSON_LOCS = [(2, 2), (2, 3), (2, 4)]
    RESCUE_REWARD = 100
    FIRE_REWARD = -100
    LIVING_REWARD = -1

    @functools.lru_cache(maxsize=None)
    def get_transition_probabilities(self, state, action):
        facts = dict(state)

        # Robot: deterministic move, staying put when the target is blocked
        row, col = facts['robot_loc']
        step_for_action = {
            self.UP : (-1, 0),
            self.DOWN : (1, 0),
            self.LEFT : (0, -1),
            self.RIGHT : (0, 1),
        }
        d_row, d_col = step_for_action[action]
        target = (row + d_row, col + d_col)
        if not self._allowed_robot_loc(*target):
            target = (row, col)
        robot_dist = { target : 1.0 }

        # Person: mostly stays, otherwise drifts to a neighboring cell
        p_row, p_col = facts['person_loc']
        person_dist = { (p_row, p_col) : 0.75 }
        neighbors = [
            (p_row, p_col + shift) for shift in (-1, 1)
            if (p_row, p_col + shift) in self.PERSON_LOCS
        ]
        assert len(neighbors) > 0
        person_dist.update({loc : 0.25/len(neighbors) for loc in neighbors})

        # Fires: each one keeps its status w.p. 0.95 and flips w.p. 0.05
        def flicker(currently_on):
            return { currently_on : 0.95, (not currently_on) : 0.05 }

        # Combine factored distributions
        factored = {
            'robot_loc' : robot_dist,
            'person_loc' : person_dist,
            'fire_left_on' : flicker(facts['fire_left_on']),
            'fire_right_on' : flicker(facts['fire_right_on']),
        }
        return combine_factored_distributions(factored)

    def _loc_on_fire(self, loc, state):
        # A fire site only burns while its flag is on
        return bool(
            (state['fire_left_on'] and loc == self.FIRE_LEFT_LOC)
            or (state['fire_right_on'] and loc == self.FIRE_RIGHT_LOC)
        )

    def get_initial_states(self):
        start = {
            'robot_loc' : (0, 3),
            'person_loc' : (2, 3),
            'fire_left_on' : True,
            'fire_right_on' : True,
        }
        return [frozenset(start.items())]


class SARStochasticMovementMDP(SARMDP):
    """Search and rescue MDP in which the robot's moves are noisy.

    The layout is:

        OFFFO
        ROOOP
        OWWWO
        OWWWO
        OWWWO
        OOOOO

    where O is empty space, W is wall, F is fire, P is person,
    and R is the robot.

    The chosen action is carried out with probability 0.9; the
    remaining 0.1 is split evenly over the other actions.

    Actions are up/down/left/right. Fires are sink states.

    Reward for rescue is +100. Penalty for fire
    is -100 (and also fires terminate the episode.)

    The living penalty is supplied by each concrete subclass.
    """
    HEIGHT, WIDTH = 6, 5
    WALL_LOCS = [(2, c) for c in range(1, WIDTH-1)] + \
                [(3, c) for c in range(1, WIDTH-1)] + \
                [(4, c) for c in range(1, WIDTH-1)]
    FIRE_LOCS = [(0, c) for c in range(1, WIDTH-1)]
    PERSON_LOC = (1, WIDTH-1)
    RESCUE_REWARD = 100
    FIRE_REWARD = -100

    @property
    @abc.abstractmethod
    def LIVING_REWARD(self):
        """Per-step reward when neither rescue nor fire applies."""
        raise NotImplementedError("Override me")

    @functools.lru_cache(maxsize=None)
    def get_transition_probabilities(self, state, action):
        facts = dict(state)
        row, col = facts['robot_loc']
        step_for_action = {
            self.UP : (-1, 0),
            self.DOWN : (1, 0),
            self.LEFT : (0, -1),
            self.RIGHT : (0, 1),
        }

        def landing_cell(act):
            # Where the robot ends up if `act` is the move actually executed
            d_row, d_col = step_for_action[act]
            target = (row + d_row, col + d_col)
            return target if self._allowed_robot_loc(*target) else (row, col)

        # Separate the outcome of the chosen action from the "slip" outcomes
        intended_cell = None
        slip_cells = []
        for act in self.get_all_actions():
            cell = landing_cell(act)
            if act == action:
                intended_cell = cell
            else:
                slip_cells.append(cell)

        # Several actions may land on the same cell, so accumulate mass
        robot_dist = { intended_cell : 0.9 }
        for cell in slip_cells:
            robot_dist[cell] = robot_dist.get(cell, 0.0) + 0.1/len(slip_cells)

        # Only the robot location changes between states
        return {
            frozenset({**facts, 'robot_loc' : cell}.items()) : prob
            for cell, prob in robot_dist.items()
        }

    def get_initial_states(self):
        start = {
            'robot_loc' : (1, 0),
            'person_loc' : self.PERSON_LOC,
        }
        return [frozenset(start.items())]

    def _loc_on_fire(self, loc, state):
        # Fires are static here, so the state is not consulted
        return loc in self.FIRE_LOCS


class ChillSARStochasticMovementMDP(SARStochasticMovementMDP):
    """Stochastic-movement search and rescue with a living reward of -1."""

    @property
    def LIVING_REWARD(self):
        """Per-step reward when neither rescue nor fire applies."""
        mild_step_penalty = -1
        return mild_step_penalty

class UrgentSARStochasticMovementMDP(SARStochasticMovementMDP):
    """Stochastic-movement search and rescue with a living reward of -10."""

    @property
    def LIVING_REWARD(self):
        """Per-step reward when neither rescue nor fire applies."""
        steep_step_penalty = -10
        return steep_step_penalty

### Rental Car MDP

This one is for you to complete! But we recommend finishing the rest of the assignment first.

In [None]:
class RentalCarMDP(MDP):
    """MDP for the rental car problem that you formulated in
    the written section.

    This class is a skeleton: every method except `state_is_terminal`
    raises NotImplementedError until it is filled in. The class-level
    constants below are the problem parameters.
    """
    # NOTE(review): presumably the total fleet size shared by both
    # cities -- confirm against the written problem statement.
    TOTAL_NUM_CARS = 3
    # NOTE(review): presumably the largest per-city demand; it matches the
    # last index of the *_RENTAL_PROBS lists below -- confirm.
    MAX_DEMAND = 2
    CITY1_RENTAL_PROBS = [0.2, 0.5, 0.3] # probability that each number
                                         # of cars is demanded in city 1
    CITY2_RENTAL_PROBS = [0.2, 0.4, 0.4] # probability that each number
                                         # of cars is demanded in city 2
    CITY1_TO_CITY2_PROB = 0.10 # the probability that a person renting a car
                               # in city 1 wants to drive to city 2
    CITY2_TO_CITY1_PROB = 0.25 # the probability that a person renting a car
                               # in city 2 wants to drive to city 1
    # NOTE(review): presumably the reward earned per rented car -- confirm.
    RENTAL_REWARD = 1.

    def get_all_states(self):
        """Return a list of all possible states of the environment.

        We're restricted to small environments because we need
        this function.

        Not yet implemented: currently raises NotImplementedError.

        Returns
        -------
        states : [ hashable ]
        """
        raise NotImplementedError("Implement me!")

    def get_all_actions(self):
        """Return a list of all possible actions of the environment.

        For simplicity, assume all actions are applicable in all states.

        Not yet implemented: currently raises NotImplementedError.

        Returns
        -------
        actions : [ hashable ]
        """
        raise NotImplementedError("Implement me!")

    # Results are memoized by lru_cache, so `state` and `action` (and the
    # instance itself) must be hashable.
    @functools.lru_cache(maxsize=None)
    def get_transition_probabilities(self, state, action):
        """Return a dictionary of next_states to probabilities.

        Not yet implemented: currently raises NotImplementedError.

        Returns
        -------
        next_states : { hashable : float }
            Maps next state to prob. Sums to 1.
        """
        raise NotImplementedError("Implement me!")

    def get_reward(self, state, action, next_state):
        """Return (deterministic) reward for executing action in state.

        Not yet implemented: currently raises NotImplementedError.

        Returns
        -------
        reward : float
            Single time step reward.
        """
        raise NotImplementedError("Implement me!")

    def get_initial_states(self):
        """Designate certain states as initial states. Not
        always part of the standard MDP formalism.

        Not yet implemented: currently raises NotImplementedError.

        Returns
        -------
        states : [ hashable ]
        """
        raise NotImplementedError("Implement me!")
        
    def state_is_terminal(self, state):
        """Designate certain states as terminal (done) states. Not
        always part of the standard MDP formalism.

        This MDP has no terminal states: the method always returns False.

        Returns
        -------
        is_terminal : bool
        """
        return False

## Value Iteration

In [None]:
def value_iteration(mdp, max_num_iterations=1000, change_threshold=1e-4,
                    gamma=0.99, print_every=None):
    """Run value iteration for a certain number of iterations or until
    the max change between iterations is below a threshold.

    Each iteration performs a synchronous Bellman backup

        Q[s][a] = sum_s' T(s, a, s') * (R(s, a, s') + gamma * V(s'))

    where V(s') = max_a' Q[s'][a'] is taken from the previous iteration
    and V(s') = 0 for terminal states (an episode stops on arrival in a
    terminal state, so nothing is collected afterwards). Action-values of
    terminal states themselves stay at 0.

    Parameters
    ----------
    mdp : MDP
        Must provide get_all_states, get_all_actions,
        get_transition_probabilities, get_reward and state_is_terminal.
    max_num_iterations : int
        Upper bound on the number of sweeps. With 0, the all-zero
        initialization is returned unchanged.
    change_threshold : float
        Stop early once the largest absolute change of any Q-value in a
        sweep drops below this value.
    gamma : float
        The temporal discount factor.
    print_every : int or None
        If given (and nonzero), print progress every `print_every` sweeps.

    Returns
    -------
    Q : { hashable : { hashable : float } }
        Q[state][action] = action-value.
    """
    states = list(mdp.get_all_states())
    actions = list(mdp.get_all_actions())
    T = mdp.get_transition_probabilities
    R = mdp.get_reward
    terminal = {s : mdp.state_is_terminal(s) for s in states}

    Q = {s : {a : 0. for a in actions} for s in states}
    for iteration in range(max_num_iterations):
        # Freeze state values first so that the sweep is synchronous
        V = {
            s : 0. if terminal[s] else max(Q[s].values(), default=0.)
            for s in states
        }
        max_change = 0.
        for s in states:
            if terminal[s]:
                continue
            for a in actions:
                backed_up = sum(
                    prob * (R(s, a, next_s) + gamma * V[next_s])
                    for next_s, prob in T(s, a).items()
                )
                max_change = max(max_change, abs(backed_up - Q[s][a]))
                Q[s][a] = backed_up
        if print_every and iteration % print_every == 0:
            print(f"VI iteration {iteration}: max change = {max_change}")
        if max_change < change_threshold:
            break
    return Q

## Evaluation Pipeline

In [None]:
def create_policy_from_Q(Q, rng=np.random):
    """Build a greedy policy from action-values Q.

    Without ties this would simply be
    lambda s : max(Q[s], key=Q[s].get)
    but when several actions share the best value, one of them is
    picked at random using `rng`.

    Parameters
    ----------
    Q : { hashable : { hashable : float } }
        Q[state][action] = action-value.
    rng : object with a `shuffle` method
        Source of randomness for tie-breaking (defaults to np.random).

    Returns
    -------
    policy : fn: state -> action
    """
    def policy(s):
        # Gather every action whose value equals the running maximum
        top_value = -np.inf
        tied_actions = set()
        for action, value in Q[s].items():
            if value == top_value:
                tied_actions.add(action)
            elif value > top_value:
                top_value, tied_actions = value, {action}
        if len(tied_actions) != 1:
            # Sort first so the outcome depends only on rng, then shuffle
            candidates = sorted(tied_actions)
            rng.shuffle(candidates)
            return candidates[0]
        (only_action,) = tied_actions
        return only_action
    return policy


def sample_from_dict(dict_probs, rng):
    """Helper for test_action_values: draw one key of `dict_probs`,
    where each key is chosen with the probability stored as its value.
    """
    total_mass = sum(dict_probs.values())
    assert abs(total_mass - 1.) < 1e-6, \
        "Probabilities do not sum to 1."
    outcomes = list(dict_probs)
    weights = [dict_probs[outcome] for outcome in outcomes]
    # Sample an index rather than an outcome, since outcomes may be
    # arbitrary hashables
    picked = rng.choice(len(outcomes), p=weights)
    return outcomes[picked]

def test_action_values(Q, mdp, num_trials=500, max_trial_length=100, 
                       rng=np.random, render=False, render_mode='rgb'):
    """Roll out the greedy policy derived from Q in the given mdp for
    a number of trials and record the (undiscounted) return of each.

    Parameters
    ----------
    Q : { hashable : { hashable : float } }
        Q[state][action] = action-value.
    mdp : MDP
    num_trials : int
        Number of episodes to simulate.
    max_trial_length : int
        Maximum number of steps per episode.
    rng : numpy random source
        Used for initial-state sampling, tie-breaking and transitions.
    render : bool
        If True, also collect one rendering per visited state.
    render_mode : str
        Passed through to mdp.render.

    Returns
    -------
    returns : [ float ]
        One per trial.
    images : list
        Only when render is True; the result is then (returns, images).
    """
    start_states = mdp.get_initial_states()
    assert all(not mdp.state_is_terminal(s) for s in start_states), \
        "Cannot have overlap between initial and terminal states!"
    transition_fn = mdp.get_transition_probabilities
    reward_fn = mdp.get_reward

    # Greedy policy with random tie-breaking
    choose_action = create_policy_from_Q(Q, rng=rng)

    images = []
    returns = []
    for _trial in range(num_trials):
        # Every trial begins from a uniformly sampled initial state
        current = start_states[rng.choice(len(start_states))]
        if render:
            images.append(mdp.render(current, mode=render_mode))
        episode_return = 0.
        for _step in range(max_trial_length):
            action = choose_action(current)
            # Sample the outcome of the action
            successor = sample_from_dict(transition_fn(current, action), rng)
            if render:
                images.append(mdp.render(successor, mode=render_mode))
            episode_return += reward_fn(current, action, successor)
            # The episode ends on arrival in a terminal state
            if mdp.state_is_terminal(successor):
                break
            current = successor
        returns.append(episode_return)

    return (returns, images) if render else returns

### Rendering (for development/debugging)

In [None]:
def imshow(img):
    """Display an RGB image inline by JPEG-encoding it with OpenCV and
    handing the bytes to IPython's display machinery.
    """
    import cv2
    import IPython
    # cv2.imencode expects BGR channel order, so convert from RGB first
    bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    encoded = cv2.imencode('.jpg', bgr)[1]
    IPython.display.display(IPython.display.Image(data=encoded))

def run_demo_with_rendering(mdp, max_num_iterations=1000, print_every=10, render_mode='string',
                            seed=8):
    """Plan with value iteration, roll out a single trial, and show
    every visited state.

    Render mode options include "string" (frames are printed) and
    "rgb" (frames are displayed with imshow).
    """
    rng = np.random.RandomState(seed)
    Q = value_iteration(mdp, max_num_iterations=max_num_iterations, print_every=print_every)
    _, frames = test_action_values(
        Q, mdp, num_trials=1, rng=rng, render=True, render_mode=render_mode)
    if render_mode != 'string':
        for frame in frames:
            imshow(frame)
        return
    # Text frames are separated by a blank line
    for frame in frames:
        print(frame)
        print()

In [None]:
# Render mode options include "string" and "rgb"
# (pass render_mode='string' below to print text frames instead of images)
for demo_mdp_cls in (SARStochasticFiresMDP,
                     ChillSARStochasticMovementMDP,
                     UrgentSARStochasticMovementMDP):
    run_demo_with_rendering(demo_mdp_cls(), render_mode='rgb')

### Main

In [None]:
def main():
    """Evaluate value iteration with several iteration budgets on every
    MDP and print a table of mean returns per MDP.
    """
    rng = np.random.RandomState(0)

    # MDPs under evaluation
    mdps = {
        'Debug1DGrid' : Debug1DGridMDP(),
        'StochasticFires' : SARStochasticFiresMDP(),
        'ChillStochasticMovement' : ChillSARStochasticMovementMDP(),
        'UrgentStochasticMovement' : UrgentSARStochasticMovementMDP(),
        'RentalCar' : RentalCarMDP()
    }

    def make_estimator(num_iterations, **extra):
        # An estimator maps an MDP to its action-values Q
        return lambda mdp : value_iteration(
            mdp, max_num_iterations=num_iterations, **extra)

    action_value_estimators = {
        'VI-0' : make_estimator(0),
        'VI-1' : make_estimator(1),
        'VI-10' : make_estimator(10),
        'VI-100' : make_estimator(100),
        'VI-1000' : make_estimator(1000, print_every=10),
    }

    # Run every estimator on every MDP, sharing one rng across all rollouts
    all_returns = {
        mdp_name : {
            estimator_name : test_action_values(estimator(mdp), mdp, rng=rng)
            for estimator_name, estimator in action_value_estimators.items()
        }
        for mdp_name, mdp in mdps.items()
    }

    # Tabulate and print results, MDPs in alphabetical order
    for mdp_name in sorted(all_returns):
        print(f"## {mdp_name} ##")
        rows = [(estimator_name, np.mean(trial_returns))
                for estimator_name, trial_returns in all_returns[mdp_name].items()]
        print(tabulate(rows, headers=["Estimator", "Mean Returns"]))

In [None]:
# Run the full evaluation: every estimator on every MDP, then print the tables.
main()