In [9]:
import numpy as np

In [33]:
# Create Colab Jupyter Notebook for our class example w/ Block Discount World:

# 5 State = {a, b, c, d, e}
# 3 Actions = {Left, Right, Exit}
# Exit available only in a & e.
# Exit from a yields reward of 10
# Exit from e yields reward of 1


# Actions(a) = exit
# ACtions(b) = left / right

states = ['a','b', 'c', 'd', 'e']
actions = ['left', 'right', 'exit']

rewards = {
    'a':{'exit': 10, 'left': 0, 'right': 0},
    'e':{'exit':1, "left":0, 'right':0},
    'b':{'left':0, 'right':0},
    'c':{'left':0, 'right':0},
    'd':{'left':0, 'right':0},
}

# Transition fucntion according to set rules
def transition(state, action):
    # Exit is the only aloud action in states a and e
    if state == 'a' and action == 'exit':
        return 'a'  # Exit state 'a'
    if state == 'e' and action == 'exit':
        return 'e'  # Exit state 'e'
    # All other state transitions
    if state=='b' and action=='left':
        return 'a'
    if state=='b' and action=='right':
        return 'c'
    if state=='c' and action=='left':
        return 'b'
    if state=='c' and action=='right':
        return 'd'
    if state=='d' and action=='left':
        return 'c'
    if state=='d' and action=='right':
        return 'e'
    # Capture all other cases
    return 'invalid'
    
    


In [47]:
# (1) Calculate Optimum Policy for cases: Transitions are deterministic, 𝛾=1, 𝛾=0.1

def optimum_policy(gamma, threshold=1e-6):
    V = {s: 0 for s in states}
    policy = {s: None for s in states}

    while True:
        delta = 0

        for s in states:
            action_values = []

            for a in actions:
                if a in rewards[s]:
                    next_state = transition(s, a)
                    if next_state == 'invalid':
                        continue
                    reward = rewards[s][a]
                    value = reward + gamma * V.get(next_state, 0)
                    action_values.append((value, a))  # Append as a tuple (value, action)

            if action_values:
                max_value, best_action = max(action_values, key=lambda x: x[0])  # Unpack the tuple
                delta = max(delta, abs(V[s] - max_value))
                V[s] = max_value
                policy[s] = best_action

        if delta < threshold:
            break

    return V, policy

In [48]:
# (2) Calculate the value of the sequence of rewards from each of the states under the optimum policy for both previous cases.

gamma_1 = 1.0
gamma_01 = 0.1

# Calculate value for gamma = 1
V_gamma_1, V_policy_1 = optimum_policy(gamma_1)
print("Values for gamma = 1:", V_gamma_1)

# Calculate value for gamma = 0.1
V_gamma_01, V_Policy_2 = optimum_policy(gamma_01)
print("Values for gamma = 0.1:", V_gamma_01)


KeyboardInterrupt: 

In [50]:
# (2) For which gamma are West and East equally good when in state d?
def compare_west_east(state, gamma):
    # Calculate values for going West (Left) and East (Right) in state `d`
    V = {s: 0 for s in states}
    
    next_state_west = transition(state, 'left')
    next_state_east = transition(state, 'right')
    
    value_west = rewards[state]['left'] + gamma * V[next_state_west]
    value_east = rewards[state]['right'] + gamma * V[next_state_east]
    
    return value_west, value_east

# Now let's find the gamma for which West and East are equally good
for gamma in np.linspace(0, 1, 100):
    value_west, value_east = compare_west_east('d', gamma)
    if np.isclose(value_west, value_east, atol=1e-2):
        print(f"West and East are equally good at gamma = {gamma}")
        break

West and East are equally good at gamma = 0.0
