In [1]:
def bellman_equation(state_values, rewards, transitions, gamma):
    """
    Perform one synchronous Bellman optimality backup over every state.

    For each state ``s`` the new value is the best action value::

        V'(s) = max_a  sum_{s'} P(s' | s, a) * (rewards[s'] + gamma * V(s'))

    Note that the reward is indexed by the *next* state, i.e. it is
    collected on arrival in ``s'``. All new values are computed from the
    old ``state_values``; the input list is not modified.

    Args:
        state_values (list): Current value estimate for each state,
            indexed by state number.
        rewards (list): Reward received on entering each state, indexed
            by state number.
        transitions (list): One dictionary per state mapping an action
            label to a ``{next_state: probability}`` dictionary.
            A state with an empty dictionary has no available actions
            and is treated as terminal.
        gamma (float): Discount factor applied to next-state values.

    Returns:
        A new list of updated state values, one per entry of
        ``state_values``. Terminal states (no actions) get ``0.0``.
    """
    new_state_values = []
    for state in range(len(state_values)):
        q_values = []
        for outcomes in transitions[state].values():
            # Accumulate with an explicit left-to-right loop rather than
            # sum(): newer Python versions use compensated summation for
            # floats in sum(), which could change the low-order digits.
            q = 0
            for next_state, probability in outcomes.items():
                q += probability * (
                    rewards[next_state] + gamma * state_values[next_state]
                )
            q_values.append(q)
        # A state without actions would make max() raise ValueError on an
        # empty sequence; no future reward is reachable from it, so its
        # value is 0.
        new_state_values.append(max(q_values, default=0.0))
    return new_state_values

# Demo: run ten Bellman backups on a small four-state MDP in which only
# arriving in state 3 pays a reward, printing the values after each sweep.
gamma = 0.9
rewards = [0, 0, 0, 100]
state_values = [0] * 4

# One entry per state: action label -> {next_state: probability}.
transitions = [
    {
        'a': {1: 0.5, 2: 0.5},
    },
    {
        'b': {3: 1},
    },
    {
        'c': {1: 0.1, 2: 0.9},
    },
    {
        'd': {3: 1},
    },
]

for i in range(10):
    # Each sweep replaces the whole value list with the backed-up values.
    state_values = bellman_equation(
        state_values,
        rewards,
        transitions,
        gamma,
    )
    print("Iteration", i + 1, state_values)


Iteration 1 [0.0, 100.0, 0.0, 100.0]
Iteration 2 [45.0, 190.0, 9.0, 190.0]
Iteration 3 [89.55, 271.0, 24.39, 271.0]
Iteration 4 [132.9255, 343.9, 44.1459, 343.9]
Iteration 5 [174.620655, 409.51, 66.709179, 409.51]
Iteration 6 [214.29863055, 468.559, 90.89033499000001, 468.559]
Iteration 7 [251.7522007455, 521.7031, 115.79148134190002, 521.7031]
Iteration 8 [286.872561603855, 569.53279, 140.74437888693902, 569.53279]
Iteration 9 [319.6247259991226, 612.579511, 165.26089799842063, 612.579511]
Iteration 10 [350.0281840492893, 651.3215599, 188.99348336872072, 651.3215599]
