# Import

In [None]:
import numpy as np

# Value Iteration

In [9]:
# Discount factor (the original author notes that other settings made the
# loop take much longer to finish)
gamma = 0.9
# Stop sweeping once no state value changes by at least this much
threshold = 1e-6

# Value estimate for each of the three states, starting from zero
V = np.zeros(3)

# Transition probabilities per action: row = current state, column = next state
P_attend_class = np.array([
    [0.5, 0.5, 0.0],
    [0.0, 0.7, 0.3],
    [0.3, 0.6, 0.1],
])

P_eat_food = np.array([
    [0.0, 0.0, 1.0],
    [0.2, 0.0, 0.8],
    [0.0, 0.0, 1.0],
])

# Reward associated with each state
R = np.array([-1, 3, 1])

# Value iteration: `iteration` ends up as the number of sweeps that did NOT
# satisfy the convergence test (i.e. total sweeps minus one).
iteration = -1
converged = False
while not converged:
    iteration += 1
    V_old = np.array(V)

    # Reward plus discounted value of every possible next state
    backup = R + gamma * V
    V_attend_class = (P_attend_class * backup).sum(axis=1)
    V_eat_food = (P_eat_food * backup).sum(axis=1)

    # Keep the better of the two actions in each state
    V = np.maximum(V_attend_class, V_eat_food)

    converged = np.abs(V - V_old).max() < threshold

# Greedy policy from the action values of the last sweep (ties go to class)
policy = np.where(V_attend_class >= V_eat_food, 'Attend class', 'Eat food')

V, policy, iteration


(array([18.95136939, 20.94056457, 19.80744364]),
 array(['Attend class', 'Attend class', 'Attend class'], dtype='<U12'),
 138)

# Policy Iteration

In [12]:
# Starting policy: action 0 ("Attend class") in every one of the three states
policy = np.zeros(3, dtype=int)

# Starting value estimate: zero for every state
V = np.zeros(3)

def policy_evaluation(policy, P_attend_class, P_eat_food, R, gamma, threshold,
                      V_init=None):
    """Iteratively evaluate a fixed policy until its value estimate settles.

    Each sweep replaces the estimate with
    ``sum_s' P[s, s'] * (R[s'] + gamma * V[s'])``, where row ``s`` of ``P``
    is taken from the transition matrix of the action the policy picks in
    state ``s``.

    Args:
        policy: One action code per state; ``0`` selects ``P_attend_class``,
            any other value selects ``P_eat_food``.
        P_attend_class: Square transition matrix for action 0.
        P_eat_food: Square transition matrix for the other action.
        R: Reward vector, one entry per state.
        gamma: Discount applied to the next-state value.
        threshold: Sweeping stops once the largest per-state change between
            two consecutive sweeps is below this value.
        V_init: Optional starting estimate. It is copied, never modified.
            Defaults to all zeros.

    Returns:
        A new float array holding the value estimate of each state.
    """
    policy = np.asarray(policy)
    rewards = np.asarray(R)

    # Build the policy's own transition matrix row by row.
    P_policy = np.where(
        (policy == 0)[:, None],
        np.asarray(P_attend_class),
        np.asarray(P_eat_food),
    )

    # Work on a local array so no module-level state is read or overwritten.
    if V_init is None:
        V = np.zeros(len(policy))
    else:
        V = np.array(V_init, dtype=float)

    while True:
        V_old = V
        V = np.sum(P_policy * (rewards + gamma * V_old), axis=1)

        # Check for convergence
        if np.max(np.abs(V - V_old)) < threshold:
            return V

def policy_improvement(V, P_attend_class, P_eat_food, R, gamma):
    """Return the greedy policy with respect to the value estimate ``V``.

    For each state, the two actions are scored as
    ``sum_s' P[s, s'] * (R[s'] + gamma * V[s'])`` and the higher-scoring
    action is chosen.

    Args:
        V: Current value estimate, one entry per state.
        P_attend_class: Square transition matrix for action 0.
        P_eat_food: Square transition matrix for action 1.
        R: Reward vector, one entry per state.
        gamma: Discount applied to the next-state value.

    Returns:
        An integer array with one action code per state: ``0`` for
        "attend class" and ``1`` for "eat food". Ties go to ``0``.
    """
    # Everything is derived from the arguments, so the function does not
    # depend on a module-level ``policy`` or on a fixed number of states.
    backup = np.asarray(R) + gamma * np.asarray(V)
    q_attend_class = np.sum(np.asarray(P_attend_class) * backup, axis=1)
    q_eat_food = np.sum(np.asarray(P_eat_food) * backup, axis=1)
    return np.where(q_attend_class >= q_eat_food, 0, 1)

# Policy iteration: alternate evaluation and greedy improvement until the
# policy no longer changes. `iteration` counts how many times it was replaced.
iteration = 0
policy_stable = False
while not policy_stable:
    # Value of the current policy
    V = policy_evaluation(policy, P_attend_class, P_eat_food, R, gamma, threshold)

    # Greedy policy with respect to that value estimate
    new_policy = policy_improvement(V, P_attend_class, P_eat_food, R, gamma)

    policy_stable = np.array_equal(new_policy, policy)
    if not policy_stable:
        policy = new_policy
        iteration += 1

# Translate action codes into readable labels (0 -> class, otherwise food)
policy_labels = np.where(policy == 0, 'Attend class', 'Eat food')
policy_labels, iteration


(array(['Attend class', 'Attend class', 'Attend class'], dtype='<U12'), 0)