# 6.882 HW 2.1 Starter Code

See the problem set handout for instructions and deliverables.

See HW1.1 Starter Code for dependency installation instructions.

In [None]:
# Install dependencies (run this once every 12 hours)
!pip install --upgrade git+https://github.com/tomsilver/pddlgym # Install most recent PDDLGym (must be from source!)
!pip install tabulate

In [None]:
import abc
import pddlgym
import heapq as hq
import numpy as np
import time
from itertools import count
from collections import defaultdict, namedtuple
from tabulate import tabulate

## A Generic Approach for PO Environments

An agent maintains a belief state and produces actions.

In [None]:
class PartialObservabilityApproach:
    """Base class for agents that act under partial observability.

    The agent tracks a belief state (the set of environment states it
    could currently be in). Each observation narrows the belief state
    by intersection; each action pushes every member state through the
    successor function. Subclasses provide the policy by overriding
    ``_get_action``.

    Parameters
    ----------
    actions : [ int ]
        A list of actions that the agent can take. All actions are
        applicable in all states.
    successor_fn : state, action -> state
        Maps an environment state and action to a next state.
    check_goal_fn : state -> bool
        Maps an environment state to true when the goal is reached.
    observation_fn : state -> observation
        Maps an environment state to an observation. Sometimes
        called "Percept".
    observation_to_states_fn : observation -> frozenset{states}
        Maps an observation to the set of environment states such
        that observation_fn(state) would produce that observation.
    """
    def __init__(self, actions, successor_fn, check_goal_fn,
                 observation_fn, observation_to_states_fn):
        self._actions = actions
        self._successor_fn = successor_fn
        self._check_goal_fn = check_goal_fn
        self._observation_fn = observation_fn
        self._observation_to_states_fn = observation_to_states_fn
        self._step_count = 0
        self._belief_state = None # set after reset
        self._rng = None # set in seed

    def reset(self, obs):
        """Tell the agent that we have started a new problem with
        initial observation "obs".

        Parameters
        ----------
        obs : hashable
            The initial observation

        Returns
        -------
        info : dict
            Any useful debug info.
        """
        # Store our own immutable copy. Without this, a mutable set handed
        # back by observation_to_states_fn would be intersected in place by
        # the first step(), silently corrupting the caller's set, and the
        # belief state would not be hashable.
        self._belief_state = frozenset(self._observation_to_states_fn(obs))
        # Reset step count
        self._step_count = 0
        return {}

    def step(self, obs):
        """Receive an observation and produce an action to be
        immediately executed in the environment.

        Parameters
        ----------
        obs : hashable
            The observation

        Returns
        -------
        action : int
            The action is assumed to be immediately taken.
        info : dict
            Any useful debug info.
        """
        # Keep only the states that are consistent with the observation.
        # Rebinding (rather than `&=`) guarantees nothing is mutated in place.
        possible_states = self._observation_to_states_fn(obs)
        self._belief_state = self._belief_state & possible_states
        # Find an action
        action, info = self._get_action()
        # Update step count
        self._step_count += 1
        # Update the belief state based on action
        self._belief_state = self._predict_belief_state(self._belief_state,
            action)
        return action, info

    def seed(self, seed):
        """Seed the agent, just in case it's random.

        Creates a fresh ``np.random.RandomState`` in ``self._rng``; until
        this is called ``self._rng`` is None.
        """
        self._rng = np.random.RandomState(seed)

    # NOTE: this class is not declared with an ABC metaclass, so the
    # decorator alone does not prevent instantiation; the
    # NotImplementedError below is what an un-overridden call hits.
    @abc.abstractmethod
    def _get_action(self):
        """Return an action to be immediately taken, based on the current
        belief state (self._belief_state). This is the main thing that
        differentiates subclasses.

        Returns
        -------
        action : int
            The action to be taken immediately.
        info : dict
            Any useful debug info.
        """
        raise NotImplementedError("Override me")

    def _check_belief_state_goal(self, belief_state):
        """Check whether the belief state is a goal, that is, whether
        all states in the belief state satisfy the check_goal_fn.

        An empty belief state is vacuously reported as a goal.

        This function is included here because it is likely to be
        used by subclasses.

        Parameters
        ----------
        belief_state : frozenset{hashable}

        Returns
        -------
        goal_reached : bool
        """
        return all(self._check_goal_fn(state) for state in belief_state)

    def _predict_belief_state(self, belief_state, action):
        """Get the next belief state that would result after taking
        action in belief_state.

        This function is included here because it is likely to be
        used by subclasses.

        Parameters
        ----------
        belief_state : frozenset{hashable}
        action : int

        Returns
        -------
        next_belief_state : frozenset{hashable}
        """
        # Distinct states may share a successor, so the result can shrink.
        return frozenset(self._successor_fn(state, action)
                         for state in belief_state)

### Random Actions Approach

In [None]:
class RandomActions(PartialObservabilityApproach):
    """Baseline agent: ignores the belief state and picks a random action.
    """
    def _get_action(self):
        # Draw from the agent's own RNG so runs are reproducible per seed.
        chosen_action = self._rng.choice(self._actions)
        debug_info = {}
        return chosen_action, debug_info

### Depth-first And-Or Search
Finish implementing this class.

In [None]:
class AndOrSearch(PartialObservabilityApproach):
    """Exhaustive depth-first And-Or search over belief states.

    The three planning methods below are intentionally left as stubs
    to be filled in.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The string "failure" is the sentinel for "no plan available".
        self._conditional_plan = "failure"

    def _get_action(self):
        """Choose the action to execute now from the current belief
        state (self._belief_state).

        Returns
        -------
        action : int
            The action to be taken immediately.
        info : dict
            Any useful debug info.
        """
        # Hint: when self._step_count == 0, build a fresh plan with
        # self._get_conditional_plan(); then execute the plan's next action.
        raise NotImplementedError("Implement me!")

    def _get_conditional_plan(self):
        """Plan from scratch, starting at the current belief state.
        """
        # Kick off the AND-OR recursion with an empty path (no ancestors yet).
        initial_path = []
        return self._run_or_search(self._belief_state, initial_path)

    def _run_or_search(self, belief_state, path, depth=0, max_depth=np.inf):
        """OR half of the recursive AND-OR search.

        Parameters
        ----------
        belief_state : frozenset{hashable}
        path : [ belief_state ]
            Belief states encountered so far, used to find cycles.
        depth : int
        max_depth : int

        Returns
        -------
        conditional_plan : Any
            Representation of the conditional plan.
        """
        raise NotImplementedError("Implement me!")

    def _run_and_search(self, belief_states, path, depth=0, max_depth=np.inf):
        """AND half of the recursive AND-OR search.

        Parameters
        ----------
        belief_states : [frozenset{hashable}]
            A list of belief states.
        path : [ belief_state ]
            Belief states encountered so far, used to find cycles.
        depth : int
        max_depth : int

        Returns
        -------
        conditional_plan : Any
            Representation of the conditional plan.
        """
        raise NotImplementedError("Implement me!")

### Iterative Deepening And-Or Search
No need to modify this (but you can if you want to).

In [None]:
class IterativeDeepeningAndOrSearch(AndOrSearch):
    """AndOrSearch wrapped in iterative deepening: retry with depth
    limits 1, 2, 3, ... and stop at the first limit that yields a plan.
    """
    def _get_conditional_plan(self):
        """Plan from scratch, starting at the current belief state.

        Never returns "failure": the loop only exits once a search
        produces something else.
        """
        depth_limit = 1
        while True:
            # '\r' keeps the progress message on a single console line.
            print(f"Running iterative deepening with depth {depth_limit}", end='\r', flush=True)
            plan = self._run_or_search(self._belief_state, [],
                depth=0, max_depth=depth_limit)
            if plan != "failure":
                # Finish the progress line before handing the plan back.
                print()
                return plan
            depth_limit += 1

### AStar Planner
Used by Single State Determinization. No need to modify this (but you can if you want to).

In [None]:
class AStar:
    """Planning with A* search. Used by SingleStateDeterminization.

    Every action has unit cost. With the default (zero) heuristic the
    search orders nodes by path cost g alone.

    Parameters
    ----------
    successor_fn : state, action -> state
        Deterministic transition function.
    check_goal_fn : state -> bool
        True when the state satisfies the goal.
    heuristic : Node -> number or tuple of numbers, optional
        Called with a search ``Node`` (not a bare state). Defaults to 0.
    timeout : number
        Wall-clock budget in seconds for a single planning call.
    """

    Node = namedtuple("Node", ["state", "parent", "action", "g"])

    def __init__(self, successor_fn, check_goal_fn, heuristic=None, timeout=100):
        self._get_successor_state = successor_fn
        self._check_goal = check_goal_fn
        self._heuristic = heuristic or (lambda s : 0)
        self._timeout = timeout
        self._actions = None # must be provided via set_actions before planning

    def __call__(self, state, verbose=True):
        """Plan from ``state``; see ``_get_plan`` for the return value."""
        return self._get_plan(state, verbose=verbose)

    def set_actions(self, actions):
        """Set the iterable of actions tried at every expanded state."""
        self._actions = actions

    def _get_plan(self, state, verbose=True):
        """Search for a sequence of actions from ``state`` to a goal.

        Returns
        -------
        plan : [ action ]
            Actions leading to a goal state. Empty if the search fails or
            times out. It is also empty when ``state`` is already a goal,
            so an empty plan alone does not indicate failure.
        info : dict
            ``{'node_expansions': int}``.
        """
        # A monotonic clock keeps the timeout immune to system clock changes.
        start_time = time.monotonic()
        queue = []
        state_to_best_g = defaultdict(lambda : float("inf"))
        tiebreak = count()

        root_node = self.Node(state=state, parent=None, action=None, g=0)
        # Record the root's cost. Otherwise an action that leads back to the
        # start state would look like an improvement and cause the root to be
        # queued and expanded a second time.
        state_to_best_g[state] = 0
        hq.heappush(queue, (self._get_priority(root_node), next(tiebreak), root_node))
        num_expansions = 0

        while queue and (time.monotonic() - start_time < self._timeout):
            _, _, node = hq.heappop(queue)
            # If we already found a better path here, don't bother
            if state_to_best_g[node.state] < node.g:
                continue
            # If the goal holds, return
            if self._check_goal(node.state):
                if verbose:
                    print("\nPlan found!")
                return self._finish_plan(node), {'node_expansions' : num_expansions}
            num_expansions += 1
            if verbose:
                print(f"Expanding node {num_expansions}", end='\r', flush=True)
            # Generate successors
            child_g = node.g + 1
            for action, child_state in self._get_successors(node.state):
                # If we already found a path to child at least this good, skip
                if state_to_best_g[child_state] <= child_g:
                    continue
                # Add new node
                child_node = self.Node(state=child_state, parent=node, action=action, g=child_g)
                priority = self._get_priority(child_node)
                hq.heappush(queue, (priority, next(tiebreak), child_node))
                state_to_best_g[child_state] = child_g

        if verbose:
            print("Warning: planning failed.")
        return [], {'node_expansions' : num_expansions}

    def _get_successors(self, state):
        """Yield (action, next_state) for every action in self._actions."""
        for action in self._actions:
            next_state = self._get_successor_state(state, action)
            yield action, next_state

    def _finish_plan(self, node):
        """Walk parent links back to the root and return the actions in
        execution order."""
        plan = []
        while node.parent is not None:
            plan.append(node.action)
            node = node.parent
        plan.reverse()
        return plan

    def _get_priority(self, node):
        """Return the heap priority (f, h) for ``node``.

        Including h as the second element breaks ties in f toward the
        smaller heuristic value. A tuple-valued heuristic yields an
        element-wise f = g + h_i.
        """
        h = self._heuristic(node)
        if isinstance(h, tuple):
            return (tuple(node.g + hi for hi in h), h)
        return (node.g + h, h)

### Single State Determinization
Finish implementing this class. (This should be relatively short.)

In [None]:
class SingleStateDeterminization(PartialObservabilityApproach):
    """Pick an arbitrary state out of the belief state, treat it as the
    true state, and plan from it with uniform cost search.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._plan = []
        # AStar is built without a heuristic argument.
        planner = AStar(self._successor_fn, self._check_goal_fn)
        planner.set_actions(self._actions)
        self._planner = planner

    def _get_action(self):
        """Choose the action to execute now from the current belief
        state (self._belief_state).

        Returns
        -------
        action : int
            The action to be taken immediately.
        info : dict
            Any useful debug info.
        """
        raise NotImplementedError("Implement me!")

### UCT Planner
Used by PO-UCT. No need to modify this (but you can if you want to).

In [None]:
class UCT:
    """UCT tree search following Leslie's lecture notes. Used by POUCT.

    Value estimates and visit counts are tables indexed as
    [state][action][depth], so states and actions must be hashable.
    ``seed`` must be called before searching or choosing actions,
    because both rely on ``self._rng``.
    """
    def __init__(self, actions, reward_fn, transition_fn, done_fn=None, num_search_iters=100, gamma=0.9):
        self._actions = actions
        self._reward_fn = reward_fn
        self._transition_fn = transition_fn
        # With no done_fn supplied, no (state, action) pair is ever terminal.
        self._done_fn = done_fn if done_fn else (lambda s,a : False)
        self._num_search_iters = num_search_iters
        self._gamma = gamma
        self._rng = None # set in seed
        self._Q = None
        self._N = None
        self._node_expansions = 0

    def run(self, state, horizon=100):
        """Rebuild the Q and N tables with num_search_iters rollouts
        from ``state`` and report how many nodes were expanded."""
        # Q[s][a][d] -> float, N[s][a][d] -> int; both start empty each run
        self._Q = defaultdict(lambda : defaultdict(lambda : defaultdict(float)))
        self._N = defaultdict(lambda : defaultdict(lambda : defaultdict(int)))
        for _ in range(self._num_search_iters):
            self._search(state, 0, horizon=horizon)
        # Hand back the counter and zero it for the next run
        expanded, self._node_expansions = self._node_expansions, 0
        return {"node_expansions" : expanded}

    def get_action(self, state, t=0):
        """Return the action with the highest Q[state][a][t]; a random
        draw in the sort key breaks ties."""
        def ranking(a):
            return (self._Q[state][a][t], self._rng.uniform())
        return max(self._actions, key=ranking)

    def _search(self, s, depth, horizon=100):
        """Run one rollout from ``s`` at ``depth``, update the tables,
        and return the updated Q[s][a][depth] for the action tried."""
        # Nothing left to earn once the horizon is reached
        if depth == horizon:
            return 0.
        # Pick an action, balancing exploration and exploitation
        a = self._select_action(s, depth, horizon=horizon)
        # Sample the child state
        next_state = self._transition_fn(s, a)
        self._node_expansions += 1
        # Some environments terminate problems before the horizon
        terminal = self._done_fn(s, a)
        q = self._reward_fn(s, a)
        if not terminal:
            q = q + self._gamma * self._search(next_state, depth+1, horizon=horizon)
        # Fold q into the running average for (s, a, depth)
        q_by_depth = self._Q[s][a]
        n_by_depth = self._N[s][a]
        prior_visits = n_by_depth[depth]
        if prior_visits == 0:
            # First visit: the sample is the estimate
            q_by_depth[depth] = q
        else:
            old_weight = prior_visits / (prior_visits + 1.)
            new_weight = 1 / (prior_visits + 1.)
            q_by_depth[depth] = old_weight * q_by_depth[depth] + new_weight * q
        n_by_depth[depth] += 1
        return q_by_depth[depth]

    def _select_action(self, s, depth, horizon):
        """Return an untried action if one exists; otherwise an action
        maximizing Q plus an exploration bonus, ties broken randomly."""
        visits = [self._N[s][a][depth] for a in self._actions]
        # Any action with N(s, a, depth) == 0 gets tried first
        never_tried = [a for a, n in zip(self._actions, visits) if n == 0]
        if never_tried:
            return self._rng.choice(never_tried)
        total_visits = sum(visits)
        top_score = -np.inf
        candidates = []
        for a, n in zip(self._actions, visits):
            bonus = (np.log(total_visits) / n)**((horizon + depth) / (2*horizon + depth))
            score = self._Q[s][a][depth] + bonus
            if score > top_score:
                top_score = score
                candidates = [a]
            elif score == top_score:
                candidates.append(a)
        return self._rng.choice(candidates)

    def seed(self, seed):
        """Create the planner's random number generator from ``seed``."""
        self._rng = np.random.RandomState(seed)

### PO-UCT
Finish implementing this class.

In [None]:
class POUCT(PartialObservabilityApproach):
    """Run UCT with belief states as the search states; belief state
    transitions are to be sampled uniformly at random.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._steps_since_replanning = 0
        self._replanning_interval = 1
        self._horizon = 50
        # UCT treats a belief state as terminal once every member is a goal.
        self._planner = UCT(self._actions, self._get_uct_reward, self._get_uct_transition,
                            done_fn=lambda s,a:self._check_belief_state_goal(s),
                            num_search_iters=100, gamma=0.9)

    def _get_action(self):
        """Choose the action to execute now from the current belief
        state (self._belief_state).

        Returns
        -------
        action : int
            The action to be taken immediately.
        info : dict
            Planner info from this step's replanning, or {} when no
            replanning happened on this step.
        """
        replan_now = (self._step_count % self._replanning_interval == 0)
        if replan_now:
            info = self._planner.run(self._belief_state, horizon=self._horizon)
            self._steps_since_replanning = 0
        else:
            info = {}
        # Steps since the last replan index the depth of the UCT tables.
        action = self._planner.get_action(self._belief_state, t=self._steps_since_replanning)
        self._steps_since_replanning += 1
        return action, info

    def _get_plan(self):
        """Run the planner from the current belief state with its
        default horizon and return the best first action.

        Returns
        -------
        action : int
            A single action (not a sequence of actions).
        """
        planner = self._planner
        planner.run(self._belief_state)
        return planner.get_action(self._belief_state)

    def _get_uct_reward(self, belief_state, _):
        """Sparse reward: 1.0 if the goal is reached, 0 otherwise.

        Parameters
        ----------
        belief_state : frozenset{hashable}

        Returns
        -------
        reward : float
        """
        raise NotImplementedError("Implement me!")

    def _get_uct_transition(self, belief_state, action):
        """Sample uniformly at random among the possible next belief states.
        """
        raise NotImplementedError("Implement me!")

    def seed(self, seed):
        """Seed both the agent and its UCT planner with the same value.
        """
        super().seed(seed)
        self._planner.seed(seed)

## Evaluation Pipeline
No need to modify this (but you can if you want to).

In [None]:
def run_single_test(test_env, problem_idx, model, max_horizon=100):
    """Run ``model`` on one problem of ``test_env`` for at most
    ``max_horizon`` steps.

    Returns
    -------
    duration : float
        Wall-clock seconds, including the environment and model resets.
    expansions : int
        Sum of the "node_expansions" entries reported by the model's
        reset and step calls (missing entries count as 0).
    num_steps : int
        Number of environment steps taken.
    success : bool
        True if the environment reported done within the horizon.
    """
    print(f"Running test problem {problem_idx}")
    test_env.fix_problem_index(problem_idx)
    began = time.time()
    obs, _ = test_env.reset()
    expansions = model.reset(obs).get("node_expansions", 0)
    steps_taken = 0
    solved = False
    for _ in range(max_horizon):
        # One dot per step as a progress indicator
        print(".", end='', flush=True)
        act, step_info = model.step(obs)
        expansions += step_info.get("node_expansions", 0)
        obs, reward, done, _ = test_env.step(act)
        steps_taken += 1
        if not done:
            continue
        # The environment is expected to pay reward 1 exactly when done
        assert reward == 1
        solved = True
        break
    elapsed = time.time() - began
    print(f" final duration: {elapsed} with num steps {steps_taken} and success={solved}.")
    return elapsed, expansions, steps_taken, solved

def run_single_experiment(model, env, seed=0):
    """Seed ``model`` and ``env``, run the model on every problem in
    ``env.problems``, then close the environment.

    Returns four parallel lists with one entry per problem:
    durations (seconds), node expansions, step counts, and success flags.
    """
    # Initialize
    model.seed(seed)
    env.seed(seed)

    # Each row is (duration, expansions, num_steps, success) for one problem
    per_problem = [run_single_test(env, problem_idx, model)
                   for problem_idx in range(len(env.problems))]

    env.close()

    test_durations = [row[0] for row in per_problem]
    test_expansions = [row[1] for row in per_problem]
    test_num_steps = [row[2] for row in per_problem]
    test_successes = [row[3] for row in per_problem]
    return test_durations, test_expansions, test_num_steps, test_successes

def get_approach(name, env, planning_timeout=10):
    """Build the approach registered under ``name``, wired to ``env``.

    Every approach is constructed from the same five environment hooks.
    ``planning_timeout`` is accepted but not used here. An unknown name
    raises an Exception.
    """
    def build(approach_cls):
        # All approaches share the same constructor arguments
        return approach_cls(env.get_possible_actions(), env.get_successor_state,
                            env.check_goal, env.get_observation, env.observation_to_states)

    if name == "random":
        return build(RandomActions)
    elif name == "depth_first_and_or_search":
        return build(AndOrSearch)
    elif name == "iterative_deepening_and_or_search":
        return build(IterativeDeepeningAndOrSearch)
    elif name == "single_state_determinization":
        return build(SingleStateDeterminization)
    elif name == "pouct":
        return build(POUCT)

    raise Exception(f"Unrecognized approach: {name}")

def print_results_table(env_name, results_for_env):
    """Print one table for ``env_name`` with a row per approach (sorted
    by name) holding the column-wise means of that approach's results."""
    print(f"\n### {env_name} ###")
    rows = []
    for approach in sorted(results_for_env):
        # axis=0 averages each metric across all recorded runs
        column_means = np.mean(results_for_env[approach], axis=0)
        rows.append((approach, *column_means))
    headers = ["Approach", "Duration", "Expansions", "Num Steps", "Successes"]
    print(tabulate(rows, headers=headers))

def main():
    """Evaluate every approach on every environment and print a
    results table per environment, then all tables again at the end."""
    # Approach name -> number of seeds to run; insertion order is run order
    num_seeds_per_approach = {
        "random" : 10,
        "depth_first_and_or_search" : 1,
        "iterative_deepening_and_or_search" : 1,
        "single_state_determinization" : 10,
        "pouct" : 10,
    }

    env_names = [
        "SmallPOSARRadius0",
        "POSARRadius1",
        "POSARRadius0",
        "POSARRadius1Xray",
        "POSARRadius0Xray",
    ]

    all_results = {}
    for env_name in env_names:
        results_for_env = all_results[env_name] = {}
        for approach, num_seeds in num_seeds_per_approach.items():
            rows = results_for_env[approach] = []
            # One environment and one model per approach, shared by all seeds
            env = pddlgym.make(f"{env_name}-v0")
            model = get_approach(approach, env)
            for seed in range(num_seeds):
                results = run_single_experiment(model, env, seed=seed)
                # Transpose four parallel lists into one tuple per problem
                rows.extend(zip(*results))

        # Print per-environment results (because of impatience)
        print_results_table(env_name, results_for_env)

    # Print final results
    print("\n" + "*" * 80)
    for env_name in env_names:
        print_results_table(env_name, all_results[env_name])


### Fire Away!

In [None]:
# Run the full evaluation defined in main() above.
main()