In [2]:
import numpy as np

In [3]:
# The world is a five-cell corridor, labelled left to right.
states = list('abcde')

# Every action name used anywhere in the corridor.
actions = ['Left', 'Right', 'Exit']

# Immediate reward per (state, action). Moving costs nothing by default ...
rewards = {state: {'Left': 0, 'Right': 0} for state in states}
# ... while the two end cells only offer 'Exit', which pays out.
rewards['a'] = {'Exit': 10}
rewards['e'] = {'Exit': 1}


In [4]:
def transition(state, action):
    """Return the successor of ``state`` after taking ``action``.

    The corridor is a -- b -- c -- d -- e. 'Left' and 'Right' move one cell
    along it, and 'Exit' from either end cell ('a' or 'e') returns ``None``.
    Any (state, action) pair not listed in the table below leaves the agent
    where it is, i.e. ``state`` itself is returned.
    """
    successors = {
        ('a', 'Exit'): None,
        ('a', 'Right'): 'b',
        ('b', 'Left'): 'a',
        ('b', 'Right'): 'c',
        ('c', 'Left'): 'b',
        ('c', 'Right'): 'd',
        ('d', 'Left'): 'c',
        ('d', 'Right'): 'e',
        ('e', 'Exit'): None,
        ('e', 'Left'): 'd',
    }
    # The default only applies to missing keys, so the explicit None entries
    # for the two exits are returned as None rather than as `state`.
    return successors.get((state, action), state)


In [5]:
def value_iteration(gamma, threshold=0.01):
    """Run synchronous value iteration over the module-level MDP.

    Each sweep backs up every state from the values of the previous sweep and
    the loop stops once the largest per-state change is below ``threshold``.

    Only the actions listed for a state in ``rewards`` are considered
    available there. An action whose ``transition`` result is ``None`` ends
    the episode, so it is worth exactly its immediate reward; no state is
    special-cased by name.

    Args:
        gamma: Discount factor applied to the successor state's value.
        threshold: Convergence tolerance on the largest value change in a sweep.

    Returns:
        A ``(policy, values)`` tuple of dicts keyed by state. ``policy`` maps
        each state to its greedy action (``None`` if the state has no
        available action) and ``values`` maps each state to its value estimate.
    """
    values = {state: 0 for state in states}
    policy = {state: None for state in states}

    while True:
        delta = 0
        new_values = values.copy()

        for state in states:
            available = rewards[state]
            action_values = {}
            # Walk `actions` rather than the reward dict so that ties are
            # always broken in the canonical action order (max() keeps the
            # first maximal key it encounters).
            for action in actions:
                if action not in available:
                    continue  # not a legal move here; never score it
                next_state = transition(state, action)
                if next_state is None:
                    # Episode ends: nothing to discount beyond the reward.
                    action_values[action] = available[action]
                else:
                    action_values[action] = available[action] + gamma * values[next_state]

            if not action_values:
                # No legal action: leave the value and the policy entry untouched.
                continue

            best_action = max(action_values, key=action_values.get)
            new_values[state] = action_values[best_action]
            policy[state] = best_action

            delta = max(delta, abs(new_values[state] - values[state]))

        values = new_values

        if delta < threshold:
            break

    return policy, values


In [6]:
# Discount factors to compare: no discounting versus heavy discounting.
gamma_values = [1, 0.1]
policies, values = {}, {}

# Solve the MDP once per discount factor, storing both results keyed by gamma.
for gamma in gamma_values:
    policy, value = value_iteration(gamma)
    policies[gamma], values[gamma] = policy, value

# Report only after every solve has finished, in the order listed above.
for gamma in gamma_values:
    print(f"Optimal Policy for γ={gamma}: {policies[gamma]}")
    print(f"State Values for γ={gamma}: {values[gamma]}")


Optimal Policy for γ=1: {'a': 'Exit', 'b': 'Left', 'c': 'Left', 'd': 'Left', 'e': 'Exit'}
State Values for γ=1: {'a': 10, 'b': 10, 'c': 10, 'd': 10, 'e': 1}
Optimal Policy for γ=0.1: {'a': 'Exit', 'b': 'Left', 'c': 'Left', 'd': 'Right', 'e': 'Exit'}
State Values for γ=0.1: {'a': 10, 'b': 1.0, 'c': 0.1, 'd': 0.1, 'e': 1}


In [7]:
def find_equal_gamma(threshold=0.001):
    """Bisect on gamma in [0, 1] until 'Left' and 'Right' tie in state 'd'.

    For each midpoint the MDP is re-solved with ``value_iteration`` and the
    one-step lookahead values of both moves out of 'd' are compared. When
    'Left' is worth more the upper bound is pulled down to the midpoint,
    otherwise the lower bound is raised to it (this assumes 'Left' becomes
    the better move as gamma grows).

    Args:
        threshold: Used both as the minimum bracket width and as the tolerance
            on the difference between the two action values.

    Returns:
        The midpoint at which the two values differ by less than
        ``threshold``, or the centre of the final bracket if that never happens.
    """
    lo, hi = 0, 1
    while hi - lo > threshold:
        mid = (lo + hi) / 2
        _, state_values = value_iteration(mid)

        # One-step lookahead from 'd' for each direction, Left first.
        q_left, q_right = (
            rewards['d'].get(move, 0) + mid * state_values[transition('d', move)]
            for move in ('Left', 'Right')
        )

        if abs(q_left - q_right) < threshold:
            return mid
        if q_left > q_right:
            hi = mid
        else:
            lo = mid

    return (lo + hi) / 2

# Run the search with its default tolerance and print the result to four decimals.
# NOTE(review): the message says 'West'/'East' while the actions defined in this
# notebook are named 'Left'/'Right' — confirm which wording is intended.
gamma_equal = find_equal_gamma()
print(f"Gamma value where 'West' and 'East' are equally good in state 'd': {gamma_equal:.4f}")


Gamma value where 'West' and 'East' are equally good in state 'd': 0.3164
