In [4]:
import numpy as np
import pandas as pd


# Model parameters:
#   alpha - probability of staying in high after search
#   beta  - probability of staying in low after search
#   gamma - discount factor
alpha, beta, gamma = 0.8, 0.6, 0.9

# Rewards:
#   r_s    - reward for search
#   r_w    - reward for wait
#   r_fail - penalty when search fails in low
r_s, r_w, r_fail = 1.0, 0.5, -3.0

# States, with each state's position used as its index into value arrays.
states = ['high', 'low']
n_states = len(states)
state_idx = {name: position for position, name in enumerate(states)}

# Actions, indexed by position in the same way.
actions = ['search', 'wait', 'recharge']
action_idx = {name: position for position, name in enumerate(actions)}

# Actions available in each state; 'recharge' is only offered in 'low'.
available_actions = {
    'high': [name for name in actions if name != 'recharge'],
    'low': list(actions),
}

# Value Iteration
def value_iteration(threshold=1e-5, max_iterations=1000):
    """Run synchronous value iteration over the module-level high/low MDP.

    Every sweep backs each state up against the values of the previous
    sweep. Iteration stops once the largest per-state change in a sweep is
    below ``threshold``, or after ``max_iterations`` sweeps.

    Returns:
        A tuple ``(V, policy)``. ``V`` is a float array indexed by
        ``state_idx``. ``policy`` is an int array where ``policy[s]`` is the
        position of the greedy action inside ``available_actions`` for the
        state with index ``s``.
    """
    hi = state_idx['high']
    lo = state_idx['low']
    V = np.zeros(n_states)
    policy = np.zeros(n_states, dtype=int)

    for _ in range(max_iterations):
        # One-step lookahead for every (state, action) pair, all computed
        # from the previous sweep's values so the update stays synchronous.
        lookahead = {
            ('high', 'search'): r_s + gamma * (alpha * V[hi] + (1 - alpha) * V[lo]),
            ('high', 'wait'): r_w + gamma * V[hi],
            ('low', 'search'): beta * (r_s + gamma * V[lo]) + (1 - beta) * (r_fail + gamma * V[hi]),
            ('low', 'wait'): r_w + gamma * V[lo],
            ('low', 'recharge'): gamma * V[hi],
        }

        swept = V.copy()
        delta = 0
        for state in states:
            s = state_idx[state]
            candidates = [lookahead[state, action] for action in available_actions[state]]
            swept[s] = max(candidates)
            # Stored as a position within available_actions[state].
            policy[s] = np.argmax(candidates)
            delta = max(delta, abs(V[s] - swept[s]))

        V = swept
        if delta < threshold:
            break

    return V, policy

# Policy Iteration
def compute_action_value(state, action, V):
    """Return the one-step lookahead value of taking ``action`` in ``state``.

    The value is the expected immediate reward plus the ``gamma``-discounted
    value of the successor state, using the module-level model parameters
    (``alpha``, ``beta``, ``r_s``, ``r_w``, ``r_fail``).

    Args:
        state: ``'high'`` or ``'low'``.
        action: ``'search'`` or ``'wait'`` in either state; ``'recharge'``
            only in ``'low'``.
        V: current state values, indexable by the integers in ``state_idx``.

    Returns:
        The action value as a float.

    Raises:
        ValueError: if the (state, action) pair is not modelled. Previously
            such a pair silently evaluated to 0, which could be picked by an
            argmax over otherwise negative values and hide the mistake.
    """
    if state == 'high':
        if action == 'search':
            # Stays in high with probability alpha, otherwise drops to low.
            return r_s + gamma * (alpha * V[state_idx['high']] + (1 - alpha) * V[state_idx['low']])
        if action == 'wait':
            return r_w + gamma * V[state_idx['high']]
    elif state == 'low':
        if action == 'search':
            # With probability beta: reward r_s, remain in low.
            # Otherwise: reward r_fail, move to high.
            return beta * (r_s + gamma * V[state_idx['low']]) + (1 - beta) * (r_fail + gamma * V[state_idx['high']])
        if action == 'wait':
            return r_w + gamma * V[state_idx['low']]
        if action == 'recharge':
            # No immediate reward; moves to high.
            return gamma * V[state_idx['high']]

    raise ValueError(
        "no transition model for action {!r} in state {!r}".format(action, state)
    )


def policy_iteration(threshold=1e-5, max_iterations=1000, max_eval_sweeps=10000):
    """Run policy iteration over the module-level high/low MDP.

    Alternates iterative policy evaluation with greedy policy improvement
    until the policy no longer changes or ``max_iterations`` improvement
    rounds have run.

    Args:
        threshold: policy evaluation stops once the largest per-state value
            change in a sweep is below this.
        max_iterations: maximum number of evaluate/improve rounds.
        max_eval_sweeps: upper bound on sweeps within one policy evaluation.
            The evaluation loop was previously unbounded and would never
            terminate if the sweeps did not shrink below ``threshold``.

    Returns:
        A tuple ``(V, policy)``. ``V`` is a float array indexed by
        ``state_idx``. ``policy`` is a list of plain ints where ``policy[s]``
        is the position of the chosen action inside ``available_actions``
        for the state with index ``s``.
    """
    # Start from the first available action in every state.
    policy = [0] * n_states
    V = np.zeros(n_states)

    for _ in range(max_iterations):
        # Policy evaluation: synchronous sweeps under the current policy.
        for _sweep in range(max_eval_sweeps):
            delta = 0
            V_new = np.copy(V)
            for state in states:
                s = state_idx[state]
                action = available_actions[state][policy[s]]
                V_new[s] = compute_action_value(state, action, V)
                delta = max(delta, abs(V[s] - V_new[s]))
            V = V_new
            if delta < threshold:
                break

        # Policy improvement: act greedily with respect to the evaluated V.
        policy_stable = True
        for state in states:
            s = state_idx[state]
            action_values = [compute_action_value(state, a, V) for a in available_actions[state]]
            # int() keeps the returned policy a list of plain Python ints.
            best_action = int(np.argmax(action_values))
            if best_action != policy[s]:
                policy_stable = False
            policy[s] = best_action

        if policy_stable:
            break

    return V, policy

def _summarize(values, policy):
    """Build a per-state table of values and the action name each policy index selects."""
    chosen = []
    for state, a in zip(states, policy):
        # Policy entries are positions within available_actions[state].
        chosen.append(available_actions[state][a])
    return pd.DataFrame({
        "State": states,
        "Optimal Value": values,
        "Optimal Action": chosen,
    })


# value iteration
V_star, policy_star = value_iteration()
df = _summarize(V_star, policy_star)
print("Using Value Iteration")
print(df)


# policy iteration
V_pi, policy_pi = policy_iteration()
df_pi = _summarize(V_pi, policy_pi)
print("Using Policy Iteration")
print(df_pi)


Using Value Iteration
  State  Optimal Value Optimal Action
0  high       8.474492         search
1   low       7.627034       recharge
Using Policy Iteration
  State  Optimal Value Optimal Action
0  high       8.474490         search
1   low       7.627032       recharge
