In [1]:
import numpy as np

# Define the MDP
states = ['H', 'A', 'C']
actions = ['Stay', 'Move to A', 'Move to H', 'Move to C']

# Transition probabilities and rewards
transition_probs = {
    'H': {'Move to A': [('A', 0.5), ('H', 0.5)], 'Move to C': [('C', 1.0)], 'Stay': [('H', 1.0)]},
    'A': {'Stay': [('A', 0.7), ('C', 0.3)], 'Move to C': [('C', 0.8), ('A', 0.2)]},
    'C': {'Move to A': [('A', 0.6), ('H', 0.3), ('C', 0.1)], 'Stay': [('C', 1.0)]},
}

rewards = {
    'H': -1,
    'A': 3,
    'C': 1,
}

gamma = 0.9  # Discount factor
threshold = 0.001  # Convergence threshold

def value_iteration():
    V = {state: 0 for state in states}
    policy = {state: None for state in states}
    
    while True:
        delta = 0
        for state in states:
            max_value = float('-inf')
            best_action = None
            for action in transition_probs[state]:
                expected_value = sum(prob * (rewards[next_state] + gamma * V[next_state])
                                     for next_state, prob in transition_probs[state][action])
                if expected_value > max_value:
                    max_value = expected_value
                    best_action = action
            delta = max(delta, abs(V[state] - max_value))
            V[state] = max_value
            policy[state] = best_action
        if delta < threshold:
            break
    
    return V, policy

def policy_iteration():
    policy = {state: np.random.choice(list(transition_probs[state].keys())) for state in states}
    V = {state: 0 for state in states}
    
    while True:
        # Policy Evaluation
        while True:
            delta = 0
            for state in states:
                action = policy[state]
                expected_value = sum(prob * (rewards[next_state] + gamma * V[next_state])
                                     for next_state, prob in transition_probs[state][action])
                delta = max(delta, abs(V[state] - expected_value))
                V[state] = expected_value
            if delta < threshold:
                break
        
        # Policy Improvement
        policy_stable = True
        for state in states:
            old_action = policy[state]
            max_value = float('-inf')
            best_action = None
            for action in transition_probs[state]:
                expected_value = sum(prob * (rewards[next_state] + gamma * V[next_state])
                                     for next_state, prob in transition_probs[state][action])
                if expected_value > max_value:
                    max_value = expected_value
                    best_action = action
            if best_action != old_action:
                policy_stable = True
            policy[state] = best_action
        
        if policy_stable:
            break
    
    return V, policy

# Run Value Iteration
V_vi, policy_vi = value_iteration()
print("Value Iteration - Optimal Value Function:")
print(V_vi)
print("Value Iteration - Optimal Policy:")
print(policy_vi)

# Run Policy Iteration
V_pi, policy_pi = policy_iteration()
print("\nPolicy Iteration - Optimal Value Function:")
print(V_pi)
print("Policy Iteration - Optimal Policy:")
print(policy_pi)


Value Iteration - Optimal Value Function:
{'H': 18.944752751756333, 'A': 20.934273596998825, 'C': 19.801668763463063}
Value Iteration - Optimal Policy:
{'H': 'Move to A', 'A': 'Stay', 'C': 'Move to A'}

Policy Iteration - Optimal Value Function:
{'H': 10.390518037482742, 'A': 10.479209833491613, 'C': 9.991404955442832}
Policy Iteration - Optimal Policy:
{'H': 'Move to A', 'A': 'Stay', 'C': 'Move to A'}
