In [1]:
import numpy as np

In [6]:
states = ['Hostel', 'Academic Building', 'Canteen']

actions = ['Attends Classes', 'Hungry']

rewards = {
    'Hostel': -1,
    'Academic Building': +3,
    'Canteen': +1
}

transition_probabilities = {
    'Hostel': {
        'Attends Classes': {'Academic Building': 0.5, 'Hostel': 0.5},
        'Hungry': {'Canteen': 1.0}
    },
    'Academic Building': {
        'Attends Classes': {'Academic Building': 0.7, 'Canteen': 0.3},
        'Hungry': {'Canteen': 0.8, 'Academic Building': 0.2}
    },
    'Canteen': {
        'Attends Classes': {'Academic Building': 0.6, 'Hostel': 0.3, 'Canteen': 0.1},
        'Hungry': {'Canteen': 1.0}
    }
}

gamma = 0.9

In [7]:
def value_iteration(states, actions, rewards, transition_probabilities, gamma=0.9, theta=1e-6):
    #Initialize value function
    V = {s: 0 for s in states}

    while True:
        delta = 0

        for s in states:
            v = V[s]

            V[s] = max(sum(transition_probabilities[s][a][s_prime] *
                           (rewards[s_prime] + gamma * V[s_prime])
                           for s_prime in transition_probabilities[s][a])
                       for a in actions)

            delta = max(delta, abs(v - V[s]))


        if delta < theta:
            break

    #Determine the optimal policy
    policy = {}
    for s in states:
        best_action = None
        best_value = float('-inf')

        for a in actions:
            action_value = sum(transition_probabilities[s][a][s_prime] *
                               (rewards[s_prime] + gamma * V[s_prime])
                               for s_prime in transition_probabilities[s][a])
            if action_value > best_value:
                best_value = action_value
                best_action = a

        policy[s] = best_action

    return V, policy

V, policy = value_iteration(states, actions, rewards, transition_probabilities, gamma)

In [11]:
print("The Optimal Value Functions are :")
for s in V:
    print(f"V of {s} = {V[s]:.2f}")

The Optimal Value Functions are :
V of Hostel = 18.95
V of Academic Building = 20.94
V of Canteen = 19.81


In [12]:
print("The Optimal Policy are:")
for s in policy:
    print(f"π of {s} = {policy[s]}")

The Optimal Policy are:
π of Hostel = Attends Classes
π of Academic Building = Attends Classes
π of Canteen = Attends Classes


In [13]:
def policy_evaluation(policy, states, actions, rewards, transition_probabilities, gamma=0.9, theta=1e-6):
    # Start with a zero value function for all states
    V = {s: 0 for s in states}

    while True:
        max_change = 0
        # Iterate through each state
        for s in states:
            previous_value = V[s]
            chosen_action = policy[s]

            V[s] = sum(
                transition_probabilities[s][chosen_action].get(s_prime, 0) *
                (rewards.get(s_prime, 0) + gamma * V[s_prime])
                for s_prime in transition_probabilities[s][chosen_action]
            )

            max_change = max(max_change, abs(previous_value - V[s]))

        if max_change < theta:
            break

    return V


In [15]:
def policy_iteration(states, actions, rewards, transition_probabilities, gamma=0.9):
    # Initialize a random policy
    policy = {s: np.random.choice(actions) for s in states}

    while True:
        # Policy Evaluation
        V = policy_evaluation(policy, states, actions, rewards, transition_probabilities, gamma)

        policy_stable = True

        # Policy Improvement
        for s in states:
            old_action = policy[s]

            best_action = None
            best_value = float('-inf')

            for a in actions:
                action_value = sum(transition_probabilities[s][a][s_prime] *
                                   (rewards[s_prime] + gamma * V[s_prime])
                                   for s_prime in transition_probabilities[s][a])
                if action_value > best_value:
                    best_value = action_value
                    best_action = a

            policy[s] = best_action

            if old_action != best_action:
                policy_stable = True

        if policy_stable:
            break

    return V, policy

V_policy, policy = policy_iteration(states, actions, rewards, transition_probabilities, gamma)

In [16]:
print("The Optimal Value Function (Policy Iteration):")
for s in V_policy:
    print(f"V of {s} = {V_policy[s]:.2f}")

The Optimal Value Function (Policy Iteration):
V of Hostel = 13.10
V of Academic Building = 13.78
V of Canteen = 10.00


In [17]:
print("The Optimal Policy (Policy Iteration):")
for s in policy:
    print(f"π of {s} = {policy[s]}")

The Optimal Policy (Policy Iteration):
π of Hostel = Attends Classes
π of Academic Building = Attends Classes
π of Canteen = Attends Classes
