# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides NumPy RuntimeWarnings (e.g. division by zero
# during belief normalisation), so NaNs can appear without any message.
warnings.filterwarnings('ignore')

# Problem parameters


In [None]:
# Labels of the model; the solver refers to each entry by its list position.
states = ["Tiger left", "Tiger right"]
actions = ["Open left", "Open right", "Listen"]
observations = ["observe tiger left", "observe tiger right"]

# Uniform starting belief over the states.
belief_initial = [1 / len(states)] * len(states)

gamma = 0.85  # Discount factor
t = 5  # Horizon

In [None]:
# Transition probabilities, read by the solver as T[next_state][state][action].
# Action columns follow `actions`: Open left, Open right, Listen.
# Both "open" columns are 0.5 everywhere, while the Listen column is 1 on the
# diagonal and 0 off it.  The array is symmetric in its two state axes, so
# reading it as T[state][next_state][action] gives the same numbers.
T = np.array(
    [
        # next state: Tiger left
        [
            [0.5, 0.5, 1.0],  # from Tiger left
            [0.5, 0.5, 0.0],  # from Tiger right
        ],
        # next state: Tiger right
        [
            [0.5, 0.5, 0.0],  # from Tiger left
            [0.5, 0.5, 1.0],  # from Tiger right
        ],
    ]
)

In [None]:
# Observation probabilities, indexed O[observation][state][action].
# Action columns follow `actions`: Open left, Open right, Listen.
# Both "open" columns are 0.5 everywhere; the Listen column is 0.85 where the
# observation index matches the state index and 0.15 where it does not.
O = np.array(
    [
        # observation: observe tiger left
        [
            [0.5, 0.5, 0.85],  # state Tiger left
            [0.5, 0.5, 0.15],  # state Tiger right
        ],
        # observation: observe tiger right
        [
            [0.5, 0.5, 0.15],  # state Tiger left
            [0.5, 0.5, 0.85],  # state Tiger right
        ],
    ]
)

In [None]:
# Immediate rewards, indexed R[state][action]
# (columns follow `actions`: Open left, Open right, Listen).
R = np.array(
    [
        [-100, 10, -1],  # state Tiger left
        [10, -100, -1],  # state Tiger right
    ]
)

# Collects every expected immediate reward evaluated during the search.
reward_list = list()

# Class definition

In [None]:
class tiger_problem:
    """Finite-horizon POMDP solved by exhaustive search over the belief tree.

    States, actions and observations are referred to by their position in the
    corresponding list.  The model arrays are indexed as follows:

    * ``tran_matrix[s_next][s][a]`` -- probability of reaching ``s_next``
      from ``s`` under action ``a``;
    * ``obs_matrix[o][s_next][a]`` -- probability of observing ``o`` once
      ``s_next`` has been reached under action ``a``;
    * ``R_func[s][a]`` -- immediate reward of taking ``a`` in state ``s``.
    """

    def __init__(self, horizon, states, actions, observations, belief_intial,
                 gamma, tran_matrix, obs_matrix, R_func, *, reward_log=None,
                 verbose=True):
        """Store the model.

        Args:
            horizon: Number of decision steps to look ahead.
            states: State labels.
            actions: Action labels.
            observations: Observation labels.
            belief_intial: Initial belief over ``states`` (the parameter name
                keeps its historical spelling so existing calls still work).
            gamma: Discount factor applied to future value.
            tran_matrix: Transition probabilities (see class docstring).
            obs_matrix: Observation probabilities (see class docstring).
            R_func: Reward table (see class docstring).
            reward_log: List that receives every expected immediate reward
                evaluated by :meth:`value`.  When omitted, a module-level
                ``reward_list`` is used if one exists, otherwise a new list.
            verbose: Print a trace of the search when true.
        """
        self.horizon = horizon
        self.states = states
        self.actions = actions
        self.observations = observations
        # Use the argument that was passed in, not a same-named global.
        self.b_init = belief_intial
        self.tran_matrix = tran_matrix
        self.obs_matrix = obs_matrix
        self.R_func = R_func
        self.gamma = gamma
        self.verbose = verbose
        if reward_log is None:
            # Keep feeding a module-level ``reward_list`` when the surrounding
            # script defines one, so code that inspects it afterwards still
            # sees the recorded rewards.
            reward_log = globals().get("reward_list")
        self.reward_log = reward_log if reward_log is not None else []

    def _observation_step(self, action, obs, b):
        """Return ``(P(obs | b, action), posterior belief)``.

        The posterior is ``None`` when the observation has zero probability,
        because the belief cannot be normalised in that case.
        """
        n_states = len(self.states)
        joint = [
            self.obs_matrix[obs][sj][action]
            * sum(self.tran_matrix[sj][si][action] * b[si]
                  for si in range(n_states))
            for sj in range(n_states)
        ]
        pr_obs = sum(joint)
        if pr_obs <= 0:
            return pr_obs, None
        b_new = [float(p / pr_obs) for p in joint]
        if self.verbose:
            print("New belief: ", b_new)
        return pr_obs, b_new

    # Function to update belief
    def belief_update(self, action, obs, b):
        """Return the belief after taking ``action`` and observing ``obs``.

        Args:
            action: Index into ``self.actions``.
            obs: Index into ``self.observations``.
            b: Current belief, one probability per state.

        Returns:
            The normalised posterior belief as a list of floats.

        Raises:
            ValueError: If ``obs`` has zero probability under ``b`` and
                ``action`` (the posterior is undefined).
        """
        _, b_new = self._observation_step(action, obs, b)
        if b_new is None:
            raise ValueError(
                "observation has zero probability under the given belief "
                "and action; the posterior belief is undefined"
            )
        return b_new

    # Value Iteration
    def value(self, b, k):
        """Return the best expected discounted value from belief ``b``.

        Args:
            b: Current belief, one probability per state.
            k: Current depth; the search stops once ``k`` reaches the horizon.

        Returns:
            ``0`` when no steps remain, otherwise the maximum over actions of
            the expected reward plus the discounted expected future value.

        Raises:
            ValueError: If no action yields a comparable value (for example
                an empty action list or NaN rewards).

        Every expected immediate reward evaluated is appended to
        ``self.reward_log``; a trace is printed when ``self.verbose`` is true.
        """
        # ``>=`` so that a depth beyond the horizon is treated as "no steps
        # left" instead of falling through to unbound locals.
        if k >= self.horizon:
            return 0

        best_value = float("-inf")
        best_action = None
        expected_rewards = np.matmul(b, self.R_func)
        for a, action_name in enumerate(self.actions):
            rw_exp = expected_rewards[a]
            if self.verbose:
                print("Action: ", a)
                print("Expected reward: ", rw_exp)
            self.reward_log.append(rw_exp)

            value_future = 0.0
            for obs in range(len(self.observations)):
                pr_obs, b_new = self._observation_step(a, obs, b)
                if b_new is None:
                    # Impossible observation: contributes nothing.
                    continue
                # Weight each branch by P(obs | b, a), which conditions the
                # observation on the *next* state, consistent with the
                # belief update.
                value_future += pr_obs * self.value(b_new, k + 1)

            action_value = rw_exp + self.gamma * value_future
            if self.verbose:
                print("Depth: ", k, " Action: ", action_name)
                print("Value: ", action_value, " Reward: ", rw_exp,
                      " Next value: ", value_future)
                print("********************************************************************")

            if action_value > best_value:
                best_action = a
                best_value = action_value

        if best_action is None:
            raise ValueError("no action produced a comparable value")

        if self.verbose:
            print()
            # Report the maximising action rather than the last one iterated.
            print("Depth: ", k, " Action: ", self.actions[best_action])
            print("V_max: ", best_value)
            print("Optimal action: ", self.actions[best_action])
        return best_value

# Evaluation

In [None]:
# Build the solver from the notebook-level model and print the value of the
# initial belief at depth 0.
tiger = tiger_problem(
    t, states, actions, observations, belief_initial, gamma, T, O, R
)
print(tiger.value(b=belief_initial, k=0))

Streaming output truncated to the last 5000 lines.
Expected reward:  -1.0
New belief:  [0.85, 0.15]
New belief:  [0.15, 0.85]
Depth:  4  Action:  Listen
Value:  -1.0  Reward:  -1.0  Next value:  0.0
********************************************************************

Depth:  4  Action:  Listen
V_max:  -1.0
Optimal action:  Listen
Depth:  3  Action:  Open left
Value:  -45.85  Reward:  -45.0  Next value:  -1.0
********************************************************************
Action:  1
Expected reward:  -45.0
New belief:  [0.5, 0.5]
Action:  0
Expected reward:  -45.0
New belief:  [0.5, 0.5]
New belief:  [0.5, 0.5]
Depth:  4  Action:  Open left
Value:  -45.0  Reward:  -45.0  Next value:  0.0
********************************************************************
Action:  1
Expected reward:  -45.0
New belief:  [0.5, 0.5]
New belief:  [0.5, 0.5]
Depth:  4  Action:  Open right
Value:  -45.0  Reward:  -45.0  Next value:  0.0
*****************************************************

In [None]:
# First expected reward recorded by tiger_problem.value: the first action
# evaluated at the root belief.
reward_list[0]

-45.0

In [None]:
# Most recently recorded expected reward.
reward_list[-1]

-0.9999999999999999