In [160]:
import copy
from dataclasses import dataclass
from collections import namedtuple, defaultdict
from msdm.core.mdp import TabularMarkovDecisionProcess
from msdm.core.pomdp import TabularPOMDP
from msdm.core.distributions import DictDistribution

State = namedtuple("State", "x y")
Action = namedtuple("Action", "dx dy open")
Observation = namedtuple("Observation", "x y")

class KeysAndDoors(TabularPOMDP):
    def __init__(
        self,
        coherence=.95,
        discount_rate=.95,
        step_cost=-1,
        target_reward=50,
        grid=None
    ):
        """
        Heaven or Hell (a.k.a. information gathering) as first described by
        [Bonet and Geffner (1998)](https://bonetblai.github.io/reports/fall98-pomdp.pdf).

        A simple POMDP where the agent must gather information to figure out
        which goal is gives a reward or punishment.

        Parameters
        ---------
        :coherence:       The strength of the signal about which side is heaven/hell
        :discount_rate:
        :step_cost:       Step cost when not reading
        :reward:
        :grid:            A multiline string representing a heaven/hell configuration.
                          `s` is the initial state,
                          `#` are walls,
                          't' is the target
                          'd' are closed doors
                          'o' are open doors
                          'l' are locked doors
        """
        if grid is None:
            grid = \
            """
            t..d.
            ##.##
            .....
            ##s..
            """
        grid = [list(r.strip()) for r in grid.split('\n') if len(r.strip()) > 0]
        self.grid = grid
        self.height = len(self.grid)
        self.width = len(self.grid[0])
        self.loc_features = {}
        self.features_loc = defaultdict(list)
        for y, row in enumerate(grid):
            for x, f in enumerate(row):
                if f == '.':
                    self.loc_features[(x, y)] = '.'
                else:
                    self.loc_features[(x, y)] = f
                self.features_loc[f].append((x, y))

        self.coherence = coherence
        self.discount_rate = discount_rate
        self.step_cost = step_cost
        self.target_reward = target_reward

        

    def initial_state_dist(self):
        x, y = self.features_loc['s'][0]
        return DictDistribution({
            State(x=x, y=y): 1.0,
        })

    def actions(self, s):
        return (
            Action(0, -1, False),
            Action(0, 1, False),
            Action(-1, 0, False),
            Action(1, 0, False),
            Action(0, 0, True),
        )

    def is_absorbing(self, s):
        loc = (s.x, s.y)
        return self.loc_features[loc] == 't'

    def next_state_dist(self, s, a):
        x, y = s.x, s.y
        nx, ny = (s.x + a.dx, s.y + a.dy)
        adjacent = []

        # Don't consider states outside of the grid
        if nx < 0 or nx >= self.width or ny < 0 or ny >= self.height:
            return DictDistribution({State(x=x, y=y): 1.0})

        # Check if agent is on an edge
        if (x-1 >= 0):
            adjacent.append((x-1, y))
        if (x+1 < self.width):
            adjacent.append((x+1, y))
        if (y-1 >= 0):
            adjacent.append((x, y-1))
        if (y+1 < self.height):
            adjacent.append((x, y+1))

        # Open Door
        if a.open:
            for adj in adjacent:
                # If a door is opened
                if self.loc_features.get(adj) == 'd':
                    adj_x, adj_y = adj
                    self.loc_features[adj] = 'o'
                    self.features_loc['d'].remove(adj)
                    self.features_loc['o'].append(adj)
                    self.grid[adj_y][adj_x] = 'o'

        # Handles movement for blocked spaces
        if self.loc_features.get((nx, ny), '#') == '#':
            nx, ny = (s.x, s.y)
        if self.loc_features.get((nx, ny), 'l') == 'l':
            nx, ny = (s.x, s.y)
        if self.loc_features.get((nx, ny), 'd') == 'd':
            nx, ny = (s.x, s.y)
        return DictDistribution({
            State(x=nx, y=ny): 1.0
        })

    def reward(self, s, a, ns):
        print(f"Checking reward for state: {ns}, features: {self.loc_features.get((ns.x, ns.y))}")
        r=0
        r += self.step_cost
        if self.loc_features[(ns.x, ns.y)] == 't':
            r += self.target_reward
        return r

    def observation_dist(self, a, ns):
        return DictDistribution({
                Observation(x=ns.x, y=ns.y): 1.0
        })

    def state_string(self, s):
        grid = copy.deepcopy(self.grid)
        for y, row in enumerate(grid):
            for x, f in enumerate(row):
                if (x, y) == (s.x, s.y):
                    grid[y][x] = '@'
        return '\n'.join([''.join(r) for r in grid])


In [161]:
from msdm.algorithms import  PointBasedValueIteration
hh = KeysAndDoors(
    coherence=.9,
    grid=
        """
        t..d.
        ##.##
        .....
        ##s..
        """,
    discount_rate=.9
)
pbvi_res = PointBasedValueIteration(
    min_belief_expansions=1,
    max_belief_expansions=20,
)

# Try to plan and print intermediate info
try:
    print("Starting planning process...")
    pbvi_res = PointBasedValueIteration(
        min_belief_expansions=1,
        max_belief_expansions=20
    ).plan_on(hh)
    print("Planning successful!")
except Exception as e:
    print(f"Error during planning: {type(e).__name__}: {str(e)}")
    

Starting planning process...
Error during planning: KeyError: State(x=3, y=0)


In [162]:
# pbvi_res.policy
traj = pbvi_res.policy.run_on(hh)
tuple(traj[0])
for t, step in enumerate(traj):
    sstr = hh.state_string(step.state)
    print(f"state {t}: \n", sstr, sep="")
    print(step.action)
    print(step.observation)
    print()

AttributeError: 'PointBasedValueIteration' object has no attribute 'policy'