# Custom MDP specification

In the following code, we show how to define a custom MDP by specifying its transition probability matrix, its reward distributions, and its starting state distribution.

In [1]:
import numpy as np
from scipy.stats import beta

from colosseum.mdp.custom_mdp import CustomEpisodic, CustomContinuous
from colosseum.utils.loops import human_loop

In [2]:
num_states = 4
num_actions = 2

# Transition probabilities, indexed as T[state, action, next_state].
# Each T[state, action] row is written out in full below and sums to one.
T = np.zeros((num_states, num_actions, num_states), dtype=np.float32)

# State 0: both actions are deterministic (action 0 -> state 1, action 1 -> state 2).
T[0, 0] = [0.0, 1.0, 0.0, 0.0]
T[0, 1] = [0.0, 0.0, 1.0, 0.0]

# State 1: action 0 splits evenly between states 2 and 3; action 1 mostly stays in state 1.
T[1, 0] = [0.0, 0.0, 0.5, 0.5]
T[1, 1] = [0.0, 0.8, 0.1, 0.1]

# State 2: action 0 splits evenly between states 1 and 3; action 1 mostly stays in state 2.
T[2, 0] = [0.0, 0.5, 0.0, 0.5]
T[2, 1] = [0.0, 0.1, 0.8, 0.1]

# State 3: action 0 always leaves state 3; action 1 mostly stays in state 3.
T[3, 0] = [0.5, 0.25, 0.25, 0.0]
T[3, 1] = [0.1, 0.1, 0.1, 0.7]

# Seed NumPy's global generator so the Beta parameters drawn below are
# the same on every run.
np.random.seed(42)


def _sample_beta_reward():
    """Return a frozen Beta distribution with both shape parameters drawn via np.random.uniform(0, 30)."""
    # The two draws are made in this order on purpose: it keeps the sequence
    # of random numbers consumed from the seeded generator well defined.
    first_shape = np.random.uniform(0, 30)
    second_shape = np.random.uniform(0, 30)
    return beta(first_shape, second_shape)


# The reward can be given as a dictionary that maps every (state, action)
# pair to its reward distribution
R = {
    (state, action): _sample_beta_reward()
    for state in range(num_states)
    for action in range(num_actions)
}
# or as a matrix whose entries correspond to the expected value of the reward
# R = np.random.rand(num_states, num_actions)

# The starting state distribution must be a dictionary: the keys are the
# possible starting states and the values are their probabilities.
# Here every episode starts in state 0.
T_0 = {0: 1.0}

# Both MDP variants are built from the same seed, starting state distribution,
# transition probabilities and rewards, so the arguments are collected once.
mdp_kwargs = dict(seed=42, T_0=T_0, T=T, R=R)

# The episodic version
mdp = CustomEpisodic(**mdp_kwargs)
human_loop(mdp)

# The continuous case
# NOTE(review): the output saved with this notebook stops, after the episodic
# run, with "TypeError: get_grid_representation() missing 1 required positional
# argument: 'h'". This suggests the continuous run below currently fails inside
# the library - confirm against the installed colosseum version.
mdp = CustomContinuous(**mdp_kwargs)
human_loop(mdp)

Start calculating the optimal policy
End calculating the optimal policy
State: TimeStep(step_type=<StepType.FIRST: 0>, reward=None, discount=None, observation=0)
[['H' '=' '0']
 ['_' '_' '_']
 [' ' ' ' 'X']
 ['A' ' ' 'X']]
The optimal action for this state is:1
State: TimeStep(step_type=<StepType.MID: 1>, reward=0.5501015186309814, discount=1.0, observation=2)
[['H' '=' '1']
 ['_' '_' '_']
 ['A' ' ' 'X']
 [' ' ' ' 'X']]
The optimal action for this state is:0
State: TimeStep(step_type=<StepType.MID: 1>, reward=0.45915117859840393, discount=1.0, observation=3)
[['H' '=' '2']
 ['_' '_' '_']
 [' ' ' ' 'X']
 [' ' 'A' 'X']]
The optimal action for this state is:0
State: TimeStep(step_type=<StepType.LAST: 2>, reward=0.796762228012085, discount=0.0, observation=-1)
State: TimeStep(step_type=<StepType.FIRST: 0>, reward=None, discount=None, observation=0)
[['H' '=' '0']
 ['_' '_' '_']
 [' ' ' ' 'X']
 ['A' ' ' 'X']]
The optimal action for this state is:1
Start calculating the optimal policy
End ca

TypeError: get_grid_representation() missing 1 required positional argument: 'h'