In [1]:
from value_iteration import ValueIteration

In [1]:
class ValueIteration:
    def __init__(self, mdp, discount=0.9, iterations=100):
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = {}

    def run_value_iteration(self):
        for i in range(self.iterations):
            updated_values = {}
            for state in self.mdp.get_all_states():
                if self.mdp.is_terminal(state):
                    updated_values[state] = self.mdp.get_reward(state)
                else:
                    updated_values[state] = max(
                        [
                            self.compute_qvalue_from_values(state, action)
                            for action in self.mdp.get_possible_actions(state)
                        ]
                    )
            self.values = updated_values

    def get_value(self, state):
        if state in self.values:
            return self.values[state]
        else:
            return 0

    def compute_qvalue_from_values(self, state, action):
        q_value = 0
        for next_state, probability in self.mdp.get_next_states(state, action).items():
            q_value += probability * (
                self.mdp.get_reward(state, action, next_state)
                + self.discount * self.get_value(next_state)
            )
        return q_value

    def compute_action_from_values(self, state):
        if self.mdp.is_terminal(state):
            return None
        else:
            actions = self.mdp.get_possible_actions(state)
            best_value, best_action = max(
                [
                    (self.compute_qvalue_from_values(state, action), action)
                    for action in actions
                ]
            )
            return best_action

    def get_action(self, state):
        return self.compute_action_from_values(state)

    def get_qvalue(self, state, action):
        return self.compute_qvalue_from_values(state, action)

    def get_policy(self, state):
        if self.mdp.is_terminal(state):
            return None
        else:
            return self.get_action(state)


In [2]:
mdp = [0,0,0,1]
discount = 0.9
iterations = 10

value = ValueIteration(mdp)
value.run_value_iteration()

AttributeError: 'list' object has no attribute 'get_all_states'

In [12]:
nos = 4  # no of states
A = ['l', 'r']  # actions
noa = 2

# R [from state][action]
R = [[-1, -1], [-1, -1], [-1, -1]]

# P [from state] [to state] [action]
P = [
    [[0.8, 0.2], [0.2, 0.8], [0, 0], [0, 0]],
    [[0.8, 0.2], [0, 0], [0.2, 0.8], [0, 0]],
    [[0, 0], [0.8, 0.2], [0, 0], [0.2, 0.8]],
]

delta = 0.01
gamma = 0.9
max_diff = 0

V = [[0,-100, -100, -100, 0],
    [1,0, 0, 0, 10],
    [0,-100, -100, -100, 0]]  # utilities of each state

print('Iteration', '0', '1', '2', '3', 'Maximum difference', sep="|")

for time in range(0, 30):
    print(time, V[0], V[1], V[2], V[3], max_diff, sep="|")
    Vnew = [[0,-100, -100, -100, 0],
    [1,0, 0, 0, 10],
    [0,-100, -100, -100, 0]]
    for i in range(3):
        for a in range(noa):
            cur_val = 0
            for j in range(nos):
                cur_val += P[i][j][a]*V[j]
            cur_val *= gamma
            cur_val += R[i][a]
            Vnew[i] = max(Vnew[i], cur_val)
    max_diff = 0
    for i in range(4):
        max_diff = max(max_diff, abs(V[i]-Vnew[i]))
    V = Vnew
    if(max_diff < delta):
        break

# one final iteration to determine the policy
Vnew = [-1e9, -1e9, -1e9, 10]
policy = ['NA', 'NA', 'NA', 'NA']
for i in range(3):
    for a in range(noa):
        cur_val = 0
        for j in range(nos):
            cur_val += P[i][j][a]*V[j]
        cur_val *= gamma
        cur_val += R[i][a]
        if(Vnew[i] < cur_val):
            policy[i] = A[a]
            Vnew[i] = max(Vnew[i], cur_val)
print("The policy is:", policy)

Iteration|0|1|2|3|Maximum difference


IndexError: list index out of range