## Question 1

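Define the MDP: three states (Hostel, Academic Building, Canteen), two actions (Class, Food), a reward attached to each state, stochastic transitions for every (state, action) pair, and a discount factor of 0.9.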

In [30]:
import numpy as np

# Define states
states = ["Hostel", "Academic Building", "Canteen"]

# Define actions
actions = ["Class", "Food"]

# Define rewards for each state (looked up by the state the agent is currently in)
rewards = {
    "Hostel": -1,
    "Academic Building": 3,
    "Canteen": 1
}

# Define transition probabilities as a dictionary:
#   (state, action) -> list of (next_state, probability) pairs
transition_probabilities = {
    ("Hostel", "Class"): [("Hostel", 0.5), ("Academic Building", 0.5)],
    ("Hostel", "Food"): [("Canteen", 1.0)],
    ("Academic Building", "Class"): [("Academic Building", 0.7), ("Canteen", 0.3)],
    ("Academic Building", "Food"): [("Canteen", 0.8), ("Academic Building", 0.2)],
    ("Canteen", "Class"): [("Academic Building", 0.6), ("Hostel", 0.3), ("Canteen", 0.1)],
    ("Canteen", "Food"): [("Canteen", 1.0)]
}

# Discount factor
gamma = 0.9

# Sanity-check the hand-written MDP so that a typo fails here, loudly, instead of
# silently skewing every value computed from it. Comprehensions are used so that
# no loop variables leak into the notebook namespace.
assert set(rewards) == set(states), "every state needs exactly one reward entry"
assert all(
    (state, action) in transition_probabilities
    for state in states
    for action in actions
), "every (state, action) pair needs a transition distribution"
assert all(
    next_state in rewards
    for outcomes in transition_probabilities.values()
    for next_state, _ in outcomes
), "transitions may only lead to known states"
assert all(
    abs(sum(prob for _, prob in outcomes) - 1.0) < 1e-9
    for outcomes in transition_probabilities.values()
), "each transition distribution must sum to 1"
assert 0 <= gamma < 1, "discount factor must lie in [0, 1)"


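Value iteration: repeatedly apply the Bellman optimality backup $V(s) \leftarrow \max_a \sum_{s'} P(s' \mid s, a)\,[R(s) + \gamma V(s')]$ to every state until the largest change in a sweep falls below the threshold `theta`.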

In [31]:
# Start every state's value estimate at 0
V = dict.fromkeys(states, 0)

# Convergence threshold: sweeps stop once the largest per-state change is below this
theta = 1e-6

def value_iteration():
    """Run in-place value iteration on the module-level MDP.

    Reads ``states``, ``actions``, ``rewards``, ``transition_probabilities``,
    ``gamma`` and ``theta`` from the notebook namespace and updates the
    module-level ``V`` in place. Each state's new value is computed from the
    current contents of ``V``, so values updated earlier in a sweep are already
    used by states visited later in that same sweep.

    Returns:
        dict: a snapshot (copy) of the converged state values. A copy is
        returned rather than ``V`` itself so that the caller's result is not
        silently overwritten when ``V`` is mutated again later.
    """
    while True:
        delta = 0
        for state in states:
            previous_value = V[state]
            # Update the value function based on the maximum expected return
            V[state] = max(
                sum(prob * (rewards[state] + gamma * V[next_state])
                    for next_state, prob in transition_probabilities[(state, action)])
                for action in actions
            )
            # Track the maximum change across all states
            delta = max(delta, abs(previous_value - V[state]))
        # Stop if the value function converges
        if delta < theta:
            break
    # Hand back a copy: V stays the shared working table, the result stays frozen.
    return dict(V)

# Run value iteration on the MDP defined above and print the converged state values
optimal_values_vi = value_iteration()
print("Optimal Values:", optimal_values_vi)


Optimal Values: {'Hostel': 16.056233839779527, 'Academic Building': 21.846509827444127, 'Canteen': 18.826701506623575}


In [32]:
def extract_policy():
    """Build the greedy policy implied by the current module-level ``V``.

    For each state, every action is scored by its expected return under
    ``transition_probabilities`` (state reward plus discounted value of the
    next state), and the highest-scoring action is kept. When scores tie, the
    action listed first in ``actions`` wins.

    Returns:
        dict: mapping of state -> chosen action.
    """
    def expected_return(state, action):
        # Expected one-step return of taking `action` in `state`
        return sum(
            prob * (rewards[state] + gamma * V[next_state])
            for next_state, prob in transition_probabilities[(state, action)]
        )

    return {
        state: max(actions, key=lambda action: expected_return(state, action))
        for state in states
    }

# Derive the greedy policy from the value function V computed above and print it
optimal_policy_vi = extract_policy()
print("Optimal Policy:", optimal_policy_vi)


Optimal Policy: {'Hostel': 'Class', 'Academic Building': 'Class', 'Canteen': 'Class'}


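Policy iteration: alternate between evaluating the current policy (computing its state values) and greedily improving it, stopping once an improvement step leaves every action unchanged.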

In [33]:
# Initialize a random policy.
# The generator is seeded so that "Restart & Run All" always draws the same
# starting policy; str(...) turns NumPy's string scalar into a plain Python str
# so the stored actions are the same type as the entries of `actions`.
POLICY_SEED = 0
policy_rng = np.random.default_rng(POLICY_SEED)
policy = {state: str(policy_rng.choice(actions)) for state in states}

def policy_evaluation(policy):
    """Evaluate a fixed policy by in-place sweeps over the module-level ``V``.

    Args:
        policy: mapping indexed as ``policy[state]`` that gives the action to
            take in each state.

    Returns:
        None. The result is left in the module-level ``V``, which is updated
        in place until the largest change in a sweep is below ``theta``.
    """
    converged = False
    while not converged:
        largest_change = 0
        for state in states:
            # Follow the action the policy prescribes for this state
            outcomes = transition_probabilities[(state, policy[state])]
            updated_value = sum(prob * (rewards[state] + gamma * V[next_state])
                                for next_state, prob in outcomes)
            largest_change = max(largest_change, abs(V[state] - updated_value))
            V[state] = updated_value
        converged = largest_change < theta

def policy_improvement():
    """Make the module-level ``policy`` greedy with respect to the current ``V``.

    Every state's entry in ``policy`` is overwritten in place with the action
    that has the highest expected return; on ties the action listed first in
    ``actions`` is chosen.

    Returns:
        bool: True if no state's action changed (the policy is stable),
        False if at least one action was replaced.
    """
    def action_value(state, action):
        # Expected one-step return of taking `action` in `state`
        outcomes = transition_probabilities[(state, action)]
        return sum(prob * (rewards[state] + gamma * V[next_state])
                   for next_state, prob in outcomes)

    changed = False
    for state in states:
        previous_action = policy[state]
        greedy_action = max(actions, key=lambda action: action_value(state, action))
        policy[state] = greedy_action
        if previous_action != greedy_action:
            changed = True
    return not changed

def policy_iteration():
    """Alternate evaluation and improvement until the policy stops changing.

    Each round calls ``policy_evaluation`` on the module-level ``policy`` and
    then ``policy_improvement``; the loop ends the first time the improvement
    step reports that the policy is stable.

    Returns:
        The module-level ``policy`` object (the same dict that was updated).
    """
    stable = False
    while not stable:
        policy_evaluation(policy)
        stable = policy_improvement()
    return policy

# Run policy iteration and print the resulting policy together with V, which
# policy_evaluation has updated in place for that policy
optimal_policy = policy_iteration()
print("Optimal Policy (Policy Iteration):", optimal_policy)
print("Optimal Values (Policy Iteration):", V)


Optimal Policy (Policy Iteration): {'Hostel': 'Class', 'Academic Building': 'Class', 'Canteen': 'Class'}
Optimal Values (Policy Iteration): {'Hostel': 16.056233536108103, 'Academic Building': 21.846509538699056, 'Canteen': 18.826701241532977}
