In [1]:
import numpy as np


Defining MDP Components

In [3]:
# Define the states
states = ['Hostel', 'Academic Building', 'Canteen']

# Define the actions
actions = ['Attend Classes', 'Eat Food']

# Define the rewards
rewards = {
    'Hostel': -1,
    'Academic Building': 3,
    'Canteen': 1
}

# Define the transition probabilities as a dictionary
# P(s'|s,a)
transition_probabilities = {
    'Hostel': {
        'Attend Classes': {'Hostel': 0.5, 'Academic Building': 0.5},
        'Eat Food': {'Canteen': 1.0}
    },
    'Academic Building': {
        'Attend Classes': {'Academic Building': 0.7, 'Canteen': 0.3},
        'Eat Food': {'Canteen': 0.8, 'Academic Building': 0.2}
    },
    'Canteen': {
        'Attend Classes': {'Academic Building': 0.6, 'Hostel': 0.3, 'Canteen': 0.1},
        'Eat Food': {'Canteen': 1.0}
    }
}

# Discount factor
gamma = 0.9

Implementing Value Iteration

In [4]:
def value_iteration(states, actions, rewards, transition_probabilities, gamma=0.9, theta=1e-6):
    # Initialize value function for each state
    V = {s: 0 for s in states}
    
    while True:
        delta = 0
        # Loop over all states
        for s in states:
            v = V[s]
            # Update the value of state s
            V[s] = max(sum(transition_probabilities[s][a][s_prime] * 
                           (rewards[s_prime] + gamma * V[s_prime])
                           for s_prime in transition_probabilities[s][a])
                       for a in actions)
            # Check for convergence
            delta = max(delta, abs(v - V[s]))
        
        # If the change in value function is less than theta, break
        if delta < theta:
            break
    
    # Derive the policy
    policy = {}
    for s in states:
        best_action = None
        best_value = float('-inf')
        
        for a in actions:
            action_value = sum(transition_probabilities[s][a][s_prime] * 
                               (rewards[s_prime] + gamma * V[s_prime])
                               for s_prime in transition_probabilities[s][a])
            if action_value > best_value:
                best_value = action_value
                best_action = a
                
        policy[s] = best_action
        
    return V, policy


In [5]:
# Run value iteration
V, policy = value_iteration(states, actions, rewards, transition_probabilities, gamma)


In [8]:
print("Optimal Value Function:")
for s in V:
    print(f"V({s}) = {V[s]:.2f}")

Optimal Value Function:
V(Hostel) = 18.95
V(Academic Building) = 20.94
V(Canteen) = 19.81


In [9]:
print("\nOptimal Policy:")
for s in policy:
    print(f"π({s}) = {policy[s]}")


Optimal Policy:
π(Hostel) = Attend Classes
π(Academic Building) = Attend Classes
π(Canteen) = Attend Classes


Implementing Policy Iteration

In [10]:
def policy_evaluation(policy, states, actions, rewards, transition_probabilities, gamma=0.9, theta=1e-6):
    # Initialize value function for each state
    V = {s: 0 for s in states}
    
    while True:
        delta = 0
        # Loop over all states
        for s in states:
            v = V[s]
            # Update the value of state s using the policy
            a = policy[s]
            V[s] = sum(transition_probabilities[s][a][s_prime] * 
                       (rewards[s_prime] + gamma * V[s_prime])
                       for s_prime in transition_probabilities[s][a])
            # Check for convergence
            delta = max(delta, abs(v - V[s]))
        
        # If the change in value function is less than theta, break
        if delta < theta:
            break
    
    return V

In [11]:
def policy_iteration(states, actions, rewards, transition_probabilities, gamma=0.9):
    # Initialize a random policy
    policy = {s: np.random.choice(actions) for s in states}
    
    while True:
        # Policy Evaluation
        V = policy_evaluation(policy, states, actions, rewards, transition_probabilities, gamma)
        
        policy_stable = True
        
        # Policy Improvement
        for s in states:
            old_action = policy[s]
            
            best_action = None
            best_value = float('-inf')
            
            for a in actions:
                action_value = sum(transition_probabilities[s][a][s_prime] * 
                                   (rewards[s_prime] + gamma * V[s_prime])
                                   for s_prime in transition_probabilities[s][a])
                if action_value > best_value:
                    best_value = action_value
                    best_action = a
            
            policy[s] = best_action
            
            if old_action != best_action:
                policy_stable = True
        
        if policy_stable:
            break
    
    return V, policy

In [12]:
# Run policy iteration
V_policy, policy = policy_iteration(states, actions, rewards, transition_probabilities, gamma)


In [13]:
print("Optimal Value Function (Policy Iteration):")
for s in V_policy:
    print(f"V({s}) = {V_policy[s]:.2f}")

Optimal Value Function (Policy Iteration):
V(Hostel) = 10.00
V(Academic Building) = 13.78
V(Canteen) = 10.00


In [14]:
print("\nOptimal Policy (Policy Iteration):")
for s in policy:
    print(f"π({s}) = {policy[s]}")


Optimal Policy (Policy Iteration):
π(Hostel) = Attend Classes
π(Academic Building) = Attend Classes
π(Canteen) = Attend Classes
