Question 1

In [1]:
#Question 1


import numpy as np

# --- MDP definition --------------------------------------------------------
# Each state name combines a location (Hostel / Academic_Building / Canteen)
# with a status (Attending_Classes / Hungry).
states = [
    'Hostel_Attending_Classes',
    'Hostel_Hungry',
    'Academic_Building_Attending_Classes',
    'Academic_Building_Hungry',
    'Canteen_Attending_Classes',
    'Canteen_Hungry',
]
actions = ['Attend_Class', 'Eat_Food']
num_states = len(states)
num_actions = len(actions)

# Reward attached to each state name.
rewards = {
    'Hostel_Attending_Classes': -1,
    'Hostel_Hungry': -1,
    'Academic_Building_Attending_Classes': 3,
    'Academic_Building_Hungry': 3,
    'Canteen_Attending_Classes': 1,
    'Canteen_Hungry': 1,
}

# Transition model: state -> action -> list of (next_state, probability).
# The probabilities listed for each (state, action) pair sum to 1.
transition_probabilities = {
    'Hostel_Attending_Classes': {
        'Attend_Class': [
            ('Academic_Building_Attending_Classes', 0.5),
            ('Hostel_Attending_Classes', 0.5),
        ],
        'Eat_Food': [
            ('Canteen_Hungry', 1.0),
        ],
    },
    'Hostel_Hungry': {
        'Attend_Class': [
            ('Academic_Building_Attending_Classes', 0.5),
            ('Hostel_Attending_Classes', 0.5),
        ],
        'Eat_Food': [
            ('Canteen_Hungry', 1.0),
        ],
    },
    'Academic_Building_Hungry': {
        'Attend_Class': [
            ('Canteen_Attending_Classes', 0.8),
            ('Academic_Building_Hungry', 0.2),
        ],
        'Eat_Food': [
            ('Canteen_Hungry', 1.0),
        ],
    },
    'Academic_Building_Attending_Classes': {
        'Attend_Class': [
            ('Canteen_Attending_Classes', 0.3),
            ('Hostel_Attending_Classes', 0.3),
            ('Academic_Building_Attending_Classes', 0.4),
        ],
        'Eat_Food': [
            ('Canteen_Hungry', 1.0),
        ],
    },
    'Canteen_Hungry': {
        'Attend_Class': [
            ('Academic_Building_Attending_Classes', 0.6),
            ('Hostel_Attending_Classes', 0.3),
            ('Canteen_Hungry', 0.1),
        ],
        'Eat_Food': [
            ('Canteen_Hungry', 1.0),
        ],
    },
    'Canteen_Attending_Classes': {
        'Attend_Class': [
            ('Academic_Building_Attending_Classes', 0.6),
            ('Hostel_Attending_Classes', 0.3),
            ('Canteen_Attending_Classes', 0.1),
        ],
        'Eat_Food': [
            ('Canteen_Attending_Classes', 1.0),
        ],
    },
}

# Initialize Value Iteration
def value_iteration(states, actions, transition_probabilities, rewards, gamma=0.9, theta=1e-4):
    """Solve a finite MDP with in-place value iteration.

    Parameters
    ----------
    states : list
        State identifiers; the position in this list is the index used in
        the returned arrays.
    actions : list
        Action identifiers; the returned policy stores indices into this list.
    transition_probabilities : dict
        ``transition_probabilities[s][a]`` is a list of
        ``(next_state, probability)`` pairs.
    rewards : dict
        Reward looked up by ``next_state`` in the Bellman backup.
    gamma : float, optional
        Discount factor.
    theta : float, optional
        Sweeps stop once the largest value change in a sweep is below this.

    Returns
    -------
    V : numpy.ndarray
        Float array of shape ``(len(states),)`` with the state values.
    policy : numpy.ndarray
        Integer array of shape ``(len(states),)``; entry ``i`` is the index
        into ``actions`` of the greedy action for ``states[i]``. Ties go to
        the lowest action index (``np.argmax`` behaviour).
    """
    # Size everything from the arguments so the function does not silently
    # depend on notebook-level globals such as `num_actions`.
    n_actions = len(actions)
    V = np.zeros(len(states))
    policy = np.zeros(len(states), dtype=int)
    state_index = {state: i for i, state in enumerate(states)}

    while True:
        delta = 0
        for s in states:
            i = state_index[s]
            v = V[i]
            action_values = np.zeros(n_actions)
            # enumerate gives the action's position directly, avoiding a
            # linear `actions.index(a)` search on every backup.
            for action_index, a in enumerate(actions):
                action_value = 0
                for next_state, prob in transition_probabilities[s][a]:
                    action_value += prob * (rewards[next_state] + gamma * V[state_index[next_state]])
                action_values[action_index] = action_value
            best_action = np.argmax(action_values)
            # In-place update: later states in this sweep already see the new value.
            V[i] = action_values[best_action]
            policy[i] = best_action
            delta = max(delta, abs(v - V[i]))

        if delta < theta:
            break

    return V, policy

#Initialize Policy Iteration
def policy_iteration(states, actions, transition_probabilities, rewards, gamma=0.9, theta=1e-4):
    """Solve a finite MDP with policy iteration.

    Alternates iterative policy evaluation (to tolerance ``theta``) with
    greedy policy improvement until an improvement pass changes no action.
    The initial policy selects ``actions[0]`` in every state.

    Parameters
    ----------
    states : list
        State identifiers; the position in this list is the index used in
        the returned arrays.
    actions : list
        Action identifiers; the returned policy stores indices into this list.
    transition_probabilities : dict
        ``transition_probabilities[s][a]`` is a list of
        ``(next_state, probability)`` pairs.
    rewards : dict
        Reward looked up by ``next_state`` in the Bellman backup.
    gamma : float, optional
        Discount factor.
    theta : float, optional
        Evaluation sweeps stop once the largest value change is below this.

    Returns
    -------
    V : numpy.ndarray
        Float array of shape ``(len(states),)`` with the values of the final
        policy as left by the last evaluation pass.
    policy : numpy.ndarray
        Integer array of shape ``(len(states),)`` of indices into ``actions``.
    """
    state_index = {state: i for i, state in enumerate(states)}
    policy = np.zeros(len(states), dtype=int)
    V = np.zeros(len(states))
    # Sized from the argument rather than the notebook-level `num_actions`,
    # so the function works for any action list it is given.
    n_actions = len(actions)

    def backup(s, a):
        """Expected one-step return of taking action `a` in state `s` under the current V."""
        action_value = 0
        for next_state, prob in transition_probabilities[s][a]:
            action_value += prob * (rewards[next_state] + gamma * V[state_index[next_state]])
        return action_value

    def policy_evaluation():
        """Sweep V in place under the current policy until changes fall below theta."""
        while True:
            delta = 0
            for s in states:
                i = state_index[s]
                v = V[i]
                V[i] = backup(s, actions[policy[i]])
                delta = max(delta, abs(v - V[i]))
            if delta < theta:
                break

    def policy_improvement():
        """Make the policy greedy w.r.t. V; return True if no action changed."""
        policy_stable = True
        for s in states:
            i = state_index[s]
            old_action = policy[i]
            action_values = np.zeros(n_actions)
            for action_index, a in enumerate(actions):
                action_values[action_index] = backup(s, a)
            best_action = np.argmax(action_values)
            policy[i] = best_action
            if old_action != best_action:
                policy_stable = False
        return policy_stable

    while True:
        policy_evaluation()
        if policy_improvement():
            break

    return V, policy

def _show_results(heading, values, action_indices):
    """Print a solver's state values and its policy translated to action names."""
    print(heading)
    print("Values: ", values)
    print("Policy: ", [actions[idx] for idx in action_indices])


# Solve with value iteration and report
V_value, policy_value = value_iteration(states, actions, transition_probabilities, rewards)
_show_results("Value Iteration Results:", V_value, policy_value)

# Solve with policy iteration and report
V_policy, policy_policy = policy_iteration(states, actions, transition_probabilities, rewards)
_show_results("\nPolicy Iteration Results:", V_policy, policy_policy)


Value Iteration Results:
Values:  [12.98306307 12.98306307 12.98306307 13.39809229 13.3145874  13.3145874 ]
Policy:  ['Eat_Food', 'Eat_Food', 'Eat_Food', 'Attend_Class', 'Attend_Class', 'Attend_Class']

Policy Iteration Results:
Values:  [12.98304403 12.98304403 12.98304403 13.39807276 13.31457007 13.31457007]
Policy:  ['Eat_Food', 'Eat_Food', 'Eat_Food', 'Attend_Class', 'Attend_Class', 'Attend_Class']
