In [7]:
import numpy as np

class MarkovDecisionProcess:
    """Plain container describing a Markov decision process.

    Attributes:
        states: Collection of states, stored as given.
        actions: Collection of actions, stored as given.
        transitions: Nested lookup indexed as ``transitions[state][action]``.
        rewards: Nested lookup indexed as ``rewards[state][action]``.
        gamma: Discount factor, stored as given.
    """

    def __init__(self, states, actions, transitions, rewards, gamma):
        """Store the MDP components without copying or validating them."""
        self.states, self.actions = states, actions
        self.transitions, self.rewards = transitions, rewards
        self.gamma = gamma

    def get_transitions(self, state, action):
        """Return the entry stored at ``transitions[state][action]``."""
        per_state = self.transitions[state]
        return per_state[action]

    def get_reward(self, state, action):
        """Return the entry stored at ``rewards[state][action]``."""
        per_state = self.rewards[state]
        return per_state[action]

def _expected_action_value(mdp, state, action, state_values):
    """Return the one-step lookahead value of taking ``action`` in ``state``.

    This is the sum, over every ``(next_state, prob)`` pair yielded by
    ``mdp.get_transitions(state, action).items()``, of
    ``prob * (reward(state, action) + gamma * state_values[next_state])``.
    """
    return sum(
        prob * (mdp.get_reward(state, action) + mdp.gamma * state_values[next_s])
        for next_s, prob in mdp.get_transitions(state, action).items()
    )


def perform_value_iteration(mdp, threshold=1e-6, max_iterations=None):
    """Solve an MDP by value iteration and extract a greedy policy.

    Values are updated in place during a sweep, so states visited later in
    the same sweep already see the refreshed values of earlier states.

    Args:
        mdp: Object exposing ``states``, ``actions``, ``gamma``,
            ``get_reward(state, action)`` and ``get_transitions(state, action)``
            (the latter returning a mapping of next state to probability).
        threshold: Sweeping stops once the largest per-state change in a
            sweep is below this value.
        max_iterations: Optional upper bound on the number of sweeps. ``None``
            (the default) keeps sweeping until ``threshold`` is met, which may
            never happen if the values do not converge (e.g. ``gamma >= 1``).

    Returns:
        A ``(state_values, policy)`` tuple: a dict mapping each state to its
        estimated value, and a dict mapping each state to the first action
        with the highest lookahead value (``None`` if no action beats
        negative infinity).

    Raises:
        ValueError: If ``max_iterations`` is negative.
    """
    if max_iterations is not None and max_iterations < 0:
        raise ValueError("max_iterations must be non-negative or None")

    state_values = {s: 0 for s in mdp.states}

    sweeps = 0
    while max_iterations is None or sweeps < max_iterations:
        sweeps += 1
        max_change = 0
        for s in mdp.states:
            best_action_value = float('-inf')
            for a in mdp.actions:
                best_action_value = max(
                    best_action_value,
                    _expected_action_value(mdp, s, a, state_values),
                )

            max_change = max(max_change, abs(best_action_value - state_values[s]))
            state_values[s] = best_action_value

        if max_change < threshold:
            break

    # Greedy policy extraction: strict '>' keeps the first action on ties.
    policy = {}
    for s in mdp.states:
        best_action = None
        best_value = float('-inf')
        for a in mdp.actions:
            action_value = _expected_action_value(mdp, s, a, state_values)
            if action_value > best_value:
                best_value = action_value
                best_action = a
        policy[s] = best_action

    return state_values, policy

def evaluate_policy(policy, mdp, state_values, threshold=1e-6):
    """Iteratively evaluate a fixed policy, updating ``state_values`` in place.

    Args:
        policy: Mapping from state to the action taken in that state.
        mdp: Object exposing ``states``, ``gamma``, ``get_reward(state, action)``
            and ``get_transitions(state, action)``.
        state_values: Dict of current value estimates; it is mutated and
            also returned.
        threshold: Sweeping stops once the largest per-state change in a
            sweep is below this value.

    Returns:
        The same ``state_values`` dict that was passed in.
    """

    def backup(state):
        # One-step lookahead under the action the policy prescribes.
        action = policy[state]
        successors = mdp.get_transitions(state, action)
        return sum(
            p * (mdp.get_reward(state, action) + mdp.gamma * state_values[nxt])
            for nxt, p in successors.items()
        )

    converged = False
    while not converged:
        largest_delta = 0
        for state in mdp.states:
            refreshed = backup(state)
            delta = abs(refreshed - state_values[state])
            if delta > largest_delta:
                largest_delta = delta
            # Written back immediately so later states use the new value.
            state_values[state] = refreshed
        converged = largest_delta < threshold

    return state_values

def perform_policy_iteration(mdp):
    """Solve an MDP by alternating policy evaluation and greedy improvement.

    Starts from a policy drawn with ``np.random.choice`` over ``mdp.actions``
    for each state and zero-initialised values, then repeats evaluation
    (via ``evaluate_policy``) and improvement until a full improvement pass
    leaves every state's action unchanged.

    Args:
        mdp: Object exposing ``states``, ``actions``, ``gamma``,
            ``get_reward(state, action)`` and ``get_transitions(state, action)``.

    Returns:
        A ``(state_values, policy)`` tuple of dicts keyed by state.
    """
    policy = {}
    for state in mdp.states:
        policy[state] = np.random.choice(mdp.actions)
    state_values = dict.fromkeys(mdp.states, 0)

    def lookahead(state, action):
        # Expected immediate reward plus discounted value of the successors.
        return sum(
            p * (mdp.get_reward(state, action) + mdp.gamma * state_values[nxt])
            for nxt, p in mdp.get_transitions(state, action).items()
        )

    stable = False
    while not stable:
        state_values = evaluate_policy(policy, mdp, state_values)

        stable = True
        for state in mdp.states:
            previous = policy[state]
            scores = {action: lookahead(state, action) for action in mdp.actions}
            # max over the dict returns the first best-scoring action.
            greedy = max(scores, key=scores.get)

            if greedy != previous:
                stable = False

            policy[state] = greedy

    return state_values, policy

# Problem definition: states (campus locations), actions (options),
# transition probabilities, per-(state, action) rewards and discount factor.
locations = ['Hostel', 'Academic-Building', 'Canteen']
options = ['attend', 'hungry']
probabilities = {
    'Hostel': {
        'attend': {'Hostel': 0.5, 'Academic-Building': 0.5},
        'hungry': {'Canteen': 1.0},
    },
    'Academic-Building': {
        'attend': {'Academic-Building': 0.7, 'Canteen': 0.3},
        'hungry': {'Academic-Building': 0.2, 'Canteen': 0.8},
    },
    'Canteen': {
        'attend': {'Hostel': 0.3, 'Academic-Building': 0.6, 'Canteen': 0.1},
        'hungry': {'Canteen': 1},
    },
}
rewards_map = {
    'Hostel': {'attend': -1, 'hungry': -1},
    'Academic-Building': {'attend': 3, 'hungry': 3},
    'Canteen': {'attend': 1, 'hungry': 1},
}
gamma_factor = 0.9

mdp_instance = MarkovDecisionProcess(
    states=locations,
    actions=options,
    transitions=probabilities,
    rewards=rewards_map,
    gamma=gamma_factor,
)

# Solve with policy iteration first, then with value iteration, and report both.
value_function, optimal_policy = perform_policy_iteration(mdp_instance)
print("SOLUTION BY POLICY ITERATION:")
print("Optimal Value Function:", value_function)
print("Optimal Policy:", optimal_policy)
print("-------------------------------------")

value_function, optimal_policy = perform_value_iteration(mdp_instance)
print("SOLUTION BY VALUE ITERATION:")
print("Optimal Value Function:", value_function)
print("Optimal Policy:", optimal_policy)

SOLUTION BY POLICY ITERATION:
Optimal Value Function: {'Hostel': 16.056233584568062, 'Academic-Building': 21.846509584777067, 'Canteen': 18.826701283836204}
Optimal Policy: {'Hostel': 'attend', 'Academic-Building': 'attend', 'Canteen': 'attend'}
-------------------------------------
SOLUTION BY VALUE ITERATION:
Optimal Value Function: {'Hostel': 16.05623383977953, 'Academic-Building': 21.846509827444127, 'Canteen': 18.82670150662358}
Optimal Policy: {'Hostel': 'attend', 'Academic-Building': 'attend', 'Canteen': 'attend'}
