In [5]:
import numpy as np

# State and action spaces of the MDP
states = ['Hostel', 'Academic Building', 'Canteen']
actions = ['Attend Class', 'Eat Food']

# Reward attached to each state (the solvers below credit it on arrival,
# i.e. they look up rewards[next_state])
rewards = {'Hostel': -1, 'Academic Building': 3, 'Canteen': 1}

# Transition model, keyed by (state, action, next_state) -> probability.
# It is written as a nested state -> action -> next_state table and then
# flattened; every (state, action) row sums to 1, and triples that are not
# listed are treated as probability 0 by the solvers (they use .get(..., 0)).
transition_probs = {
    (state, action, next_state): probability
    for state, by_action in {
        'Hostel': {
            'Attend Class': {'Academic Building': 0.5, 'Hostel': 0.5},
            'Eat Food': {'Canteen': 1.0},
        },
        'Academic Building': {
            'Attend Class': {'Academic Building': 0.7, 'Canteen': 0.3},
            'Eat Food': {'Canteen': 0.8, 'Academic Building': 0.2},
        },
        'Canteen': {
            'Attend Class': {'Academic Building': 0.6, 'Hostel': 0.3, 'Canteen': 0.1},
            'Eat Food': {'Canteen': 1.0},
        },
    }.items()
    for action, outcomes in by_action.items()
    for next_state, probability in outcomes.items()
}

# Solver parameters
gamma = 0.9  # Discount factor
epsilon = 1e-6  # Convergence threshold
V = dict.fromkeys(states, 0)  # Value function, zero for every state

# Value Iteration
def value_iteration():
    """Run synchronous value iteration on the module-level MDP.

    Reads ``states``, ``actions``, ``transition_probs``, ``rewards``,
    ``gamma`` and ``epsilon`` from module scope and rebinds the global ``V``
    after every sweep.  Each sweep backs every state up from the values of
    the previous sweep; iteration stops once the largest per-state change in
    a sweep is below ``epsilon``.  Nothing is returned: the result is the
    final contents of ``V``.
    """
    global V
    converged = False
    while not converged:
        largest_change = 0
        updated = V.copy()
        for state in states:
            # One expected return per action, all computed from the old V.
            action_returns = [
                sum(transition_probs.get((state, action, nxt), 0) *
                    (rewards[nxt] + gamma * V[nxt])
                    for nxt in states)
                for action in actions
            ]
            updated[state] = max(action_returns)
            largest_change = max(largest_change, abs(V[state] - updated[state]))
        V = updated
        converged = largest_change < epsilon

# Solve the MDP: value_iteration() rebinds the module-level V, so the loop
# below prints the converged value of every state.
value_iteration()
print("Optimal Values and Policy With Value Iteration")
print("Optimal Values :")
for s in states:
    print(f"{s}: {V[s]}")

# Derive Policy
def get_policy():
    """Return the greedy policy for the current global value function ``V``.

    For every state, each action is scored by its one-step expected return
    (transition probability times reward of the next state plus the
    discounted value of the next state) and the highest-scoring action is
    chosen.  When several actions score the same, the one listed first in
    ``actions`` wins.

    Returns:
        dict mapping each state in ``states`` to the chosen action.
    """
    def expected_return(state, action):
        # Triples missing from transition_probs contribute probability 0.
        return sum(transition_probs.get((state, action, nxt), 0) *
                   (rewards[nxt] + gamma * V[nxt])
                   for nxt in states)

    return {
        state: max(actions, key=lambda action: expected_return(state, action))
        for state in states
    }

# Derive the greedy policy from the values currently held in V and print
# the chosen action for every state.
policy = get_policy()
print("\nOptimal Policy:")
for s in states:
    print(f"{s}: {policy[s]}")


Optimal Values and Policy With Value Iteration
Optimal Values :
Hostel: 18.951369393117425
Academic Building: 20.94056457250515
Canteen: 19.807443641628296

Optimal Policy:
Hostel: Attend Class
Academic Building: Attend Class
Canteen: Attend Class


In [6]:
import numpy as np

# Initialize parameters for policy iteration.
# NOTE: `states` (and the MDP model used by the functions below) are defined
# in the earlier cell, so that cell must have been run first.
gamma = 0.9  # Discount factor
epsilon = 1e-6  # Convergence threshold
# Starting policy is fixed, not random: every state begins with 'Attend Class'
policy = {s: 'Attend Class' for s in states}
V = {state: 0 for state in states}  # Reset the value function to zero

# Policy Evaluation
def policy_evaluation():
    """Iteratively evaluate the current global ``policy``.

    Repeatedly sweeps over ``states``, replacing each state's value with the
    expected return of the action the policy prescribes there (computed from
    the previous sweep's values), and rebinds the global ``V`` after every
    sweep.  Stops once the largest per-state change in a sweep is below
    ``epsilon``.  Nothing is returned: the result is left in ``V``.
    """
    global V
    converged = False
    while not converged:
        biggest_change = 0
        refreshed = V.copy()
        for state in states:
            chosen_action = policy[state]
            expected_return = sum(
                transition_probs.get((state, chosen_action, nxt), 0) *
                (rewards[nxt] + gamma * V[nxt])
                for nxt in states
            )
            refreshed[state] = expected_return
            biggest_change = max(biggest_change, abs(V[state] - expected_return))
        V = refreshed
        converged = biggest_change < epsilon

# Policy Improvement
def policy_improvement(tolerance=None):
    """Make the global ``policy`` greedy with respect to the current ``V``.

    For every state the one-step expected return of each action is computed
    from ``transition_probs``, ``rewards``, ``gamma`` and ``V``.  The action
    stored in ``policy`` is replaced only when the best action beats it by
    more than ``tolerance``.  Keeping the current action on (near-)ties
    matters because ``V`` is only accurate to about ``epsilon``: without the
    guard, two equally good actions can trade places from one sweep to the
    next and the surrounding policy-iteration loop may never see a stable
    policy.

    Args:
        tolerance: Minimum gain in expected return required before an action
            is switched.  Defaults to the module-level ``epsilon``.

    Returns:
        True if no state's action was changed (policy is stable),
        False otherwise.
    """
    global policy
    if tolerance is None:
        tolerance = epsilon

    def action_value(s, a):
        # Triples missing from transition_probs contribute probability 0.
        return sum(transition_probs.get((s, a, s_prime), 0) *
                   (rewards[s_prime] + gamma * V[s_prime])
                   for s_prime in states)

    stable = True
    for s in states:
        old_action = policy[s]
        best_action = max(actions, key=lambda a: action_value(s, a))
        # Switch only on a real improvement, never on a tie.
        if (best_action != old_action and
                action_value(s, best_action) > action_value(s, old_action) + tolerance):
            policy[s] = best_action
            stable = False
    return stable

# Policy Iteration
def policy_iteration():
    """Alternate policy evaluation and improvement until the policy is stable.

    Each round first calls ``policy_evaluation()`` and then
    ``policy_improvement()``; the loop ends as soon as the improvement step
    reports (by a truthy return value) that it changed nothing.  Results are
    left in the module-level ``V`` and ``policy`` by those two functions.
    """
    policy_stable = False
    while not policy_stable:
        policy_evaluation()
        policy_stable = policy_improvement()

# Run policy iteration; it leaves its results in the module-level V and
# policy, which are then printed state by state.
policy_iteration()
print("Optimal Values and Policy With policy Iteration")
print("Optimal Values :")

for s in states:
    print(f"{s}: {V[s]}")

print("\nOptimal Policy:")
for s in states:
    print(f"{s}: {policy[s]}")


Optimal Values and Policy With policy Iteration
Optimal Values :
Hostel: 18.951369393117425
Academic Building: 20.94056457250515
Canteen: 19.807443641628296

Optimal Policy:
Hostel: Attend Class
Academic Building: Attend Class
Canteen: Attend Class
